From 9e45ab56bd165609118989c0d1bec309c3754560 Mon Sep 17 00:00:00 2001 From: Chunseok Lee Date: Thu, 30 Jul 2020 11:40:16 +0900 Subject: [PATCH] patch for rebase master on release/1.7.0 Change-Id: Id38b617d325ef7e854995a47f032bdf482a779b3 --- .ahub/tcchecker-tca/config.yaml | 43 + compiler/.ahub/tcchecker-tca/config.yaml | 54 + compiler/bcq-tools/CMakeLists.txt | 27 + compiler/bcq-tools/README.md | 78 + compiler/bcq-tools/generate_bcq_output_arrays | 90 + compiler/bcq-tools/preserve_bcq_info | 116 + compiler/circle-quantizer/CMakeLists.txt | 1 + compiler/circle-quantizer/requires.cmake | 1 + compiler/circle-quantizer/src/CircleQuantizer.cpp | 18 +- compiler/circle-tensordump/driver/Driver.cpp | 2 +- compiler/circle-tensordump/src/Dump.cpp | 48 +- compiler/circle-verify/src/Driver.cpp | 2 +- .../circle2circle-dredd-recipe-test/CMakeLists.txt | 93 +- .../circle2circle-dredd-recipe-test/requires.cmake | 4 +- compiler/circle2circle-dredd-recipe-test/test.lst | 3 +- .../circle2circle-dredd-recipe-test/testall.sh | 13 +- compiler/circle2circle/CMakeLists.txt | 2 + compiler/circle2circle/requires.cmake | 1 + compiler/circle2circle/src/Circle2Circle.cpp | 14 + compiler/circlechef/CMakeLists.txt | 4 +- compiler/circlechef/circle/src/RecipeChef.cpp | 2 + compiler/circlechef/core/src/ModelChef.cpp | 1 + compiler/circlechef/proto/circlechef.proto | 1 + compiler/circlechef/tools/file/Driver.cpp | 2 +- compiler/circlechef/tools/reverse/Driver.cpp | 2 +- compiler/circledump/driver/Driver.cpp | 2 +- compiler/circledump/src/OpPrinter.cpp | 15 + compiler/common-artifacts/exclude.lst | 31 +- compiler/hermes/src/hermes.test.cpp | 25 +- compiler/locomotiv/src/Node/BiasEncode.test.cpp | 14 +- compiler/locomotiv/src/Node/MatMul.test.cpp | 4 + compiler/locop/src/FormattedGraph.test.cpp | 2 + compiler/locop/src/FormattedTensorShape.test.cpp | 2 + .../include/luci_interpreter/core/Tensor.h | 9 +- compiler/luci-interpreter/src/core/KernelParams.h | 5 + .../luci-interpreter/src/kernels/CMakeLists.txt | 9 + .../luci-interpreter/src/kernels/DepthToSpace.cpp | 90 + .../luci-interpreter/src/kernels/DepthToSpace.h | 45 + .../src/kernels/DepthToSpace.test.cpp | 60 + .../src/kernels/L2Normalize.test.cpp | 9 +- .../src/kernels/LeakyRelu.test.cpp | 11 +- .../luci-interpreter/src/kernels/Logistic.test.cpp | 6 +- compiler/luci-interpreter/src/kernels/Reverse.cpp | 81 + compiler/luci-interpreter/src/kernels/Reverse.h | 43 + .../luci-interpreter/src/kernels/Reverse.test.cpp | 66 + compiler/luci-interpreter/src/kernels/Slice.cpp | 149 + compiler/luci-interpreter/src/kernels/Slice.h | 44 + .../luci-interpreter/src/kernels/Slice.test.cpp | 64 + .../src/kernels/TransposeConv.test.cpp | 23 +- .../luci-interpreter/src/loader/CMakeLists.txt | 7 + .../luci-interpreter/src/loader/GraphLoader.cpp | 23 +- compiler/luci-interpreter/src/loader/GraphLoader.h | 18 +- .../luci-interpreter/src/loader/KernelBuilder.cpp | 108 +- .../luci-interpreter/src/loader/KernelBuilder.h | 17 +- .../src/loader/KernelBuilder.test.cpp | 743 + .../luci-interpreter/src/loader/ModuleLoader.cpp | 7 +- .../luci-interpreter/src/loader/ModuleLoader.h | 5 - compiler/luci-value-test/evalverify.sh | 6 +- compiler/luci-value-test/test.lst | 110 +- .../luci/export/src/CircleOperationExporter.cpp | 2 +- compiler/luci/export/src/CircleTensorExporter.cpp | 5 +- compiler/luci/import/src/CircleReader.cpp | 2 + compiler/luci/import/src/Importer.test.cpp | 7 +- compiler/luci/import/src/Nodes/CircleLogistic.cpp | 14 - .../luci/import/src/Nodes/CircleTransposeConv.cpp | 18 + 
compiler/luci/lang/include/luci/IR/CircleNodes.lst | 1 + .../luci/lang/include/luci/IR/CircleQuantParam.h | 1 + compiler/luci/lang/src/Module.test.cpp | 2 +- compiler/luci/lang/src/Nodes/CircleCustom.test.cpp | 7 +- compiler/luci/lang/src/Nodes/CircleIf.test.cpp | 4 + compiler/luci/lang/src/Nodes/CircleWhile.test.cpp | 4 + compiler/luci/pass/src/CircleOptimizer.cpp | 4 +- compiler/luci/pass/src/FuseBCQPass.cpp | 426 +- compiler/luci/pass/src/QuantizationUtils.cpp | 7 + compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp | 21 +- compiler/luci/tests/test.lst | 9 + compiler/one-cmds/one-codegen | 25 +- compiler/one-cmds/one-import | 25 +- compiler/one-cmds/one-import-tf | 30 +- compiler/one-cmds/one-import-tflite | 20 +- compiler/one-cmds/one-optimize | 20 +- compiler/one-cmds/one-pack | 23 +- compiler/one-cmds/one-quantize | 23 +- compiler/one-cmds/requires.cmake | 1 + compiler/record-minmax/CMakeLists.txt | 5 + compiler/record-minmax/driver/Driver.cpp | 16 +- compiler/record-minmax/requires.cmake | 1 + compiler/record-minmax/src/HDF5Importer.cpp | 1 + compiler/record-minmax/src/MinMaxObserver.cpp | 3 +- compiler/record-minmax/src/RecordMinMax.cpp | 2 +- .../record-minmax/tests/RecordFunction.test.cpp | 14 + compiler/tfl-verify/CMakeLists.txt | 1 + compiler/tfl-verify/requires.cmake | 1 + compiler/tfl-verify/src/Driver.cpp | 19 +- compiler/tflchef/core/src/ModelChef.cpp | 1 + compiler/tflchef/proto/tflchef.proto | 1 + compiler/tflchef/tflite/src/RecipeChef.cpp | 2 + compiler/tflchef/tools/file/Driver.cpp | 2 +- compiler/tflchef/tools/reverse/Driver.cpp | 2 +- compiler/tfldump/driver/Driver.cpp | 2 +- compiler/tflite2circle/CMakeLists.txt | 1 + compiler/tflite2circle/driver/Driver.cpp | 17 +- compiler/tflite2circle/requires.cmake | 1 + compiler/vconone/CMakeLists.txt | 31 + compiler/vconone/README.md | 14 + compiler/vconone/driver/driver.cpp | 36 + compiler/vconone/include/vconone/vconone.h | 61 + compiler/vconone/src/version.cpp | 63 + compiler/vconone/src/version.test.cpp | 49 + compiler/vconone/version_cfg.h.in | 22 + .../core/CL/kernels/CLArgOperationKernel.h | 124 - .../arm_compute/core/CL/kernels/CLCastKernel.h | 121 - .../core/CL/kernels/CLDepthToSpaceKernel.h | 82 - .../CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h | 117 - .../arm_compute/core/CL/kernels/CLPReLUKernel.h | 83 - .../core/CL/kernels/CLSpaceToDepthKernel.h | 82 - .../kernels/CLTransposeConvLayerUpsampleKernel.h | 109 - .../core/CPP/kernels/CPPUpsampleKernelEx.h | 88 - .../arm_compute/core/NEON/kernels/NECastKernel.h | 96 - .../NEON/kernels/NEDepthToSpaceLayerKernelEx.h | 96 - .../core/NEON/kernels/NEElementwiseUnaryKernelEx.h | 118 - .../arm_compute/core/NEON/kernels/NEPReLUKernel.h | 100 - .../NEON/kernels/NESpaceToDepthLayerKernelEx.h | 97 - .../arm_compute/runtime/CL/CLFunctionsEx.h | 11 - .../runtime/CL/functions/CLArgOperation.h | 129 - .../runtime/CL/functions/CLBatchToSpaceND.h | 69 - .../arm_compute/runtime/CL/functions/CLCast.h | 75 - .../runtime/CL/functions/CLDepthToSpace.h | 68 - .../CL/functions/CLDirectTransposeConvLayer.h | 201 + .../CL/functions/CLFullyConnectedHybridLayer.h | 4 +- .../CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h | 142 - .../runtime/CL/functions/CLLogicalNot.h | 62 - .../arm_compute/runtime/CL/functions/CLPReLU.h | 64 - .../runtime/CL/functions/CLPixelWiseDivision.h | 103 - .../runtime/CL/functions/CLRNNLayerEx.h | 120 - .../runtime/CL/functions/CLSpaceToDepth.h | 68 - .../runtime/CL/functions/CLStridedSliceEx.h | 81 - .../runtime/CL/functions/CLTransposeConvLayer.h | 176 +- 
.../CL/functions/CLTransposeConvLayerUpsample.h | 102 - .../runtime/CPP/functions/CPPUpsampleEx.h | 65 - .../arm_compute/runtime/NEON/NEFunctionsEx.h | 7 - .../arm_compute/runtime/NEON/functions/NECast.h | 79 - .../runtime/NEON/functions/NEDepthToSpaceLayerEx.h | 78 - .../NEON/functions/NEElementwiseUnaryLayerEx.h | 70 - .../NEON/functions/NEFullyConnectedHybridLayer.h | 4 +- .../functions/NEGEMMLowpMatrixMultiplyCoreEx.h | 170 - .../arm_compute/runtime/NEON/functions/NEPReLU.h | 63 - .../runtime/NEON/functions/NERNNLayerEx.h | 130 - .../runtime/NEON/functions/NEReduceMeanEx.h | 99 - .../runtime/NEON/functions/NESpaceToBatchLayerEx.h | 136 - .../runtime/NEON/functions/NESpaceToDepthLayerEx.h | 79 - .../runtime/NEON/functions/NETransposeConvLayer.h | 68 +- .../ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 39 - .../src/core/CL/cl_kernels/arg_operation.cl | 137 - .../core/CL/cl_kernels/arithmetic_op_quantized.cl | 191 - .../ARMComputeEx/src/core/CL/cl_kernels/cast.cl | 233 - .../src/core/CL/cl_kernels/depth_to_space.cl | 185 - .../ARMComputeEx/src/core/CL/cl_kernels/helpers.h | 206 +- .../src/core/CL/cl_kernels/helpers_asymm.h | 185 +- .../ARMComputeEx/src/core/CL/cl_kernels/prelu.cl | 120 - .../src/core/CL/cl_kernels/prelu_quantized.cl | 138 - .../src/core/CL/cl_kernels/space_to_depth.cl | 185 - .../src/core/CL/kernels/CLArgOperationKernel.cpp | 181 - .../core/CL/kernels/CLBinaryLogicalOpKernel.cpp | 1 + .../src/core/CL/kernels/CLCastKernel.cpp | 132 - .../src/core/CL/kernels/CLDepthToSpaceKernel.cpp | 140 - .../core/CL/kernels/CLEmbeddingLookupKernel.cpp | 1 + .../kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp | 372 - .../src/core/CL/kernels/CLGatherExKernel.cpp | 1 + .../core/CL/kernels/CLHashtableLookupKernel.cpp | 3 +- .../CLInstanceNormalizationLayerKernelEx.cpp | 2 +- .../CL/kernels/CLMultiplyScaleFactorKernel.cpp | 1 + .../src/core/CL/kernels/CLNegKernel.cpp | 1 + .../src/core/CL/kernels/CLPReLUKernel.cpp | 210 - .../CL/kernels/CLQuantizationSymmetricKernel.cpp | 3 +- .../core/CL/kernels/CLReduceOperationKernel.cpp | 1 + .../core/CL/kernels/CLScaleFactorSymm8Kernel.cpp | 1 + .../src/core/CL/kernels/CLSpaceToDepthKernel.cpp | 148 - .../kernels/CLTransposeConvLayerUpsampleKernel.cpp | 188 - .../src/core/CPP/kernels/CPPUpsampleKernelEx.cpp | 118 - .../src/core/NEON/kernels/NECastKernel.cpp | 671 - .../NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp | 181 - .../NEON/kernels/NEElementwiseUnaryKernelEx.cpp | 221 - .../src/core/NEON/kernels/NEPReLUKernel.cpp | 291 - .../NEON/kernels/NEQuantizationSymmetricKernel.cpp | 2 +- .../NEON/kernels/NESpaceToDepthLayerKernelEx.cpp | 181 - .../src/runtime/CL/functions/CLArgOperation.cpp | 144 - .../src/runtime/CL/functions/CLBinaryLogicalOp.cpp | 2 +- .../src/runtime/CL/functions/CLCast.cpp | 52 - .../src/runtime/CL/functions/CLDepthToSpace.cpp | 52 - .../CL/functions/CLDirectTransposeConvLayer.cpp | 267 + .../src/runtime/CL/functions/CLEmbeddingLookup.cpp | 2 +- .../CL/functions/CLFullyConnectedHybridLayer.cpp | 16 +- .../CL/functions/CLFullyConnectedLayerEx.cpp | 4 +- .../functions/CLFullyConnectedReshapingLayer.cpp | 16 +- .../functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp | 180 - .../src/runtime/CL/functions/CLGatherEx.cpp | 2 +- .../src/runtime/CL/functions/CLHashtableLookup.cpp | 2 +- .../functions/CLInstanceNormalizationLayerEx.cpp | 2 +- .../src/runtime/CL/functions/CLPReLU.cpp | 63 - .../src/runtime/CL/functions/CLRNNLayerEx.cpp | 163 - .../src/runtime/CL/functions/CLReduceOperation.cpp | 8 +- .../src/runtime/CL/functions/CLSpaceToDepth.cpp | 52 - 
.../runtime/CL/functions/CLTransposeConvLayer.cpp | 250 +- .../CL/functions/CLTransposeConvLayerUpsample.cpp | 92 - .../src/runtime/CPP/functions/CPPOneHotEx.cpp | 4 +- .../src/runtime/CPP/functions/CPPUpsampleEx.cpp | 53 - .../runtime/NEON/functions/NEActivationLayerEx.cpp | 4 +- .../NEON/functions/NEBinaryLogicalOperation.cpp | 6 +- .../src/runtime/NEON/functions/NECast.cpp | 60 - .../NEON/functions/NEDepthToSpaceLayerEx.cpp | 63 - .../runtime/NEON/functions/NEEmbeddingLookup.cpp | 4 +- .../NEON/functions/NEFullyConnectedHybridLayer.cpp | 14 +- .../functions/NEFullyConnectedReshapingLayer.cpp | 7 +- .../functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp | 513 - .../src/runtime/NEON/functions/NEGatherEx.cpp | 4 +- .../runtime/NEON/functions/NEHashtableLookup.cpp | 4 +- .../src/runtime/NEON/functions/NEPReLU.cpp | 55 - .../src/runtime/NEON/functions/NERNNLayerEx.cpp | 161 - .../src/runtime/NEON/functions/NEReduceMeanEx.cpp | 180 - .../NEON/functions/NESpaceToBatchLayerEx.cpp | 114 - .../NEON/functions/NESpaceToDepthLayerEx.cpp | 64 - .../NEON/functions/NETransposeConvLayer.cpp | 231 +- compute/cker/CMakeLists.txt | 3 + compute/cker/include/cker/Types.h | 11 + compute/cker/include/cker/Utils.h | 62 + .../cker/include/cker/operation/FullyConnected.h | 13 +- compute/cker/include/cker/operation/L2Normalize.h | 94 + compute/cker/include/cker/operation/Logistic.h | 9 - compute/cker/include/cker/operation/Pad.h | 15 +- compute/cker/include/cker/operation/Quantize.h | 47 + compute/cker/include/cker/operation/SpaceToDepth.h | 71 + compute/cker/include/cker/ruy/RuySupport.h | 2 +- docs/howto/how-to-build-runtime.md | 6 +- docs/nnfw/howto/CrossBuildForAndroid.md | 4 +- docs/runtime/core.md | 4 +- docs/runtime/heterogeneous-execution.md | 4 +- infra/cmake/packages/ARMComputeSourceConfig.cmake | 2 +- infra/cmake/packages/FlatBuffersConfig.cmake | 3 +- infra/cmake/packages/HDF5Config.cmake | 1 + infra/cmake/packages/Pybind11Config.cmake | 21 + infra/cmake/packages/Pybind11SourceConfig.cmake | 18 + infra/docker/Dockerfile | 3 +- infra/docker/Dockerfile.1804 | 7 +- infra/nncc/CMakeLists.txt | 1 + infra/nncc/command/utcount | 2 +- .../packages/TensorFlowLite-2.2.0/CMakeLists.txt | 2 +- infra/nnfw/config/gbs.conf | 6 +- infra/packaging/preset/20200630 | 14 +- infra/packaging/res/tf2nnpkg.20200630 | 19 +- infra/scripts/build-tcm.sh | 24 + infra/scripts/compiler_modules.sh | 2 +- .../scripts/docker_build_cross_aarch64_runtime.sh | 2 +- infra/scripts/docker_build_cross_arm_runtime.sh | 2 +- .../docker_build_cross_arm_runtime_release.sh | 2 +- infra/scripts/docker_build_cross_coverage.sh | 2 +- infra/scripts/docker_build_nncc.sh | 10 + infra/scripts/docker_build_tizen_cross.sh | 2 +- infra/scripts/docker_collect_nnpkg_resources.sh | 2 +- infra/scripts/tizen_xu4_test.sh | 2 +- master_diff_1.7.0.patch | 30424 +++++++++++++++++++ packaging/nnfw.spec | 2 +- .../AveragePool2D_U8_000/test.recipe | 26 + .../AveragePool2D_U8_000/test.reverse | 0 .../DepthwiseConv2D_003/test.recipe | 44 + .../DepthwiseConv2D_003/test.reverse | 0 .../DepthwiseConv2D_003/test.rule | 3 + .../DepthwiseConv2D_U8_001/test.recipe | 61 + .../DepthwiseConv2D_U8_001/test.reverse | 0 .../L2Normalize_U8_000/test.recipe | 22 + .../L2Normalize_U8_000/test.reverse | 0 .../Logistic_U8_000/test.recipe | 19 + .../Logistic_U8_000/test.reverse | 0 .../TransposeConv_000/test.recipe | 2 +- res/TensorFlowLiteRecipes/Unique_000/test.recipe | 27 + res/TensorFlowLiteRecipes/Unique_000/test.reverse | 0 res/TensorFlowLiteRecipes/Unique_001/test.recipe | 27 + 
res/TensorFlowLiteRecipes/Unique_001/test.reverse | 0 res/TensorFlowLiteRecipes/Unique_002/test.recipe | 27 + res/TensorFlowLiteRecipes/Unique_002/test.reverse | 0 res/TensorFlowLiteRecipes/Unique_003/test.recipe | 27 + res/TensorFlowLiteRecipes/Unique_003/test.reverse | 0 .../Unique_U8_000/test.recipe | 28 + .../Unique_U8_000/test.reverse | 0 .../Unique_U8_001/test.recipe | 28 + .../Unique_U8_001/test.reverse | 0 runtime/libs/benchmark/CMakeLists.txt | 3 +- runtime/libs/benchmark/src/Result.cpp | 2 +- runtime/onert/api/include/nnfw.h | 18 +- runtime/onert/api/src/nnfw_api.cc | 1 + runtime/onert/api/src/nnfw_api_internal.cc | 31 +- runtime/onert/backend/acl_cl/KernelGenerator.cc | 804 +- runtime/onert/backend/acl_common/AclKernelGen.h | 269 + runtime/onert/backend/acl_neon/KernelGenerator.cc | 777 +- runtime/onert/backend/cpu/ConstantInitializer.cc | 35 +- runtime/onert/backend/cpu/ConstantInitializer.h | 9 + runtime/onert/backend/cpu/KernelGenerator.cc | 509 +- runtime/onert/backend/cpu/KernelGenerator.h | 3 + runtime/onert/backend/cpu/StaticTensorManager.cc | 104 + runtime/onert/backend/cpu/StaticTensorManager.h | 61 + runtime/onert/backend/cpu/Tensor.h | 15 +- runtime/onert/backend/cpu/TensorBuilder.cc | 18 +- runtime/onert/backend/cpu/TensorBuilder.h | 13 +- runtime/onert/backend/cpu/ops/CompareLayer.cc | 238 +- .../onert/backend/cpu/ops/FullyConnectedLayer.cc | 35 +- .../onert/backend/cpu/ops/FullyConnectedLayer.h | 3 + runtime/onert/backend/cpu/ops/L2NormLayer.cc | 71 + runtime/onert/backend/cpu/ops/L2NormLayer.h | 55 + runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc | 4 +- runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h | 7 +- runtime/onert/backend/cpu/ops/OperationUtils.h | 11 + runtime/onert/backend/cpu/ops/PadLayer.cc | 25 +- runtime/onert/backend/cpu/ops/PadLayer.h | 8 +- runtime/onert/backend/cpu/ops/QuantizeLayer.cc | 63 + runtime/onert/backend/cpu/ops/QuantizeLayer.h | 56 + runtime/onert/backend/cpu/ops/SliceLayer.cc | 16 +- runtime/onert/backend/cpu/ops/SliceLayer.h | 3 +- runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc | 70 + runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h | 54 + .../onert/core/include/backend/ITensorBuilder.h | 4 +- .../onert/core/include/backend/ITensorRegistry.h | 68 +- .../backend/cpu_common/StaticTensorManager.h | 4 +- .../core/include/compiler/StaticShapeInference.h | 1 + .../core/include/exec/DynamicShapeInference.h | 1 + runtime/onert/core/include/ir/Operations.Include.h | 1 + runtime/onert/core/include/ir/Operations.lst | 1 + .../onert/core/include/ir/operation/LogSoftmax.h | 2 +- runtime/onert/core/include/ir/operation/Pad.h | 2 +- runtime/onert/core/include/ir/operation/Quantize.h | 49 + .../backend/controlflow/DynamicTensorManager.cc | 14 +- .../src/backend/controlflow/KernelGenerator.cc | 22 +- .../core/src/backend/controlflow/TensorBuilder.cc | 6 +- .../core/src/backend/controlflow/UserTensor.h | 1 + .../src/backend/cpu_common/DynamicTensorManager.cc | 10 +- .../src/backend/cpu_common/StaticTensorManager.cc | 28 +- runtime/onert/core/src/compiler/ExecutorFactory.cc | 37 +- runtime/onert/core/src/compiler/ExecutorFactory.h | 3 + runtime/onert/core/src/compiler/HEScheduler.h | 10 +- .../onert/core/src/compiler/OperationValidator.cc | 161 +- .../onert/core/src/compiler/OperationValidator.h | 4 +- .../core/src/compiler/StaticShapeInference.cc | 5 + runtime/onert/core/src/compiler/TensorBuilders.h | 12 + .../onert/core/src/exec/DynamicShapeInference.cc | 5 + runtime/onert/core/src/exec/ExecutorBase.cc | 4 +- 
runtime/onert/core/src/interp/operations/Pad.cc | 4 +- runtime/onert/core/src/ir/LoweredGraph.cc | 3 - runtime/onert/core/src/ir/operation/Quantize.cc | 37 + .../core/src/ir/pass/PermutationEliminationPass.cc | 195 - .../core/src/ir/pass/PermutationEliminationPass.h | 86 - .../core/src/ir/pass/PermutationInsertionPass.cc | 15 +- .../frontend/base_loader/include/base_loader.h | 36 + .../frontend/nnapi/wrapper/OperationFactory.cc | 337 +- runtime/onert/test/core/exec/ExecInstance.cc | 94 +- tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl | 18 +- .../nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon | 19 +- tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu | 13 +- tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl | 18 +- tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon | 19 +- tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu | 13 +- tests/nnapi/nnapi_gtest.skip.noarch.interp | 16 + tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu | 13 +- .../specs/V1_0/l2_normalization_quant8_nnfw.mod.py | 30 + .../specs/{skip => }/V1_2/pad_v2_1_float.mod.py | 0 .../specs/{skip => }/V1_2/pad_v2_1_quant8.mod.py | 0 .../specs/{skip => }/V1_2/pad_v2_all_dims.mod.py | 0 .../{skip => }/V1_2/pad_v2_all_dims_quant8.mod.py | 0 .../specs/{skip => }/V1_2/pad_v2_low_rank.mod.py | 0 .../{skip => }/V1_2/pad_v2_low_rank_quant8.mod.py | 0 tests/nnapi/specs/{skip => }/V1_2/quantize.mod.py | 0 tests/nnfw_api/src/ValidationTestAddModelLoaded.cc | 19 +- .../src/ValidationTestAddSessionPrepared.cc | 6 +- tests/nnfw_api/src/ValidationTestSessionCreated.cc | 28 +- tests/scripts/benchmark_nnapi.sh | 23 +- tests/scripts/common.sh | 11 +- tests/scripts/framework/run_test.sh | 60 +- tests/scripts/test-driver.sh | 17 - tests/scripts/test_framework.sh | 10 +- tests/tools/nnpackage_run/CMakeLists.txt | 2 +- tests/tools/nnpackage_run/src/args.cc | 246 +- tests/tools/nnpackage_run/src/h5formatter.cc | 8 +- tests/tools/tflite_loader/CMakeLists.txt | 2 +- tests/tools/tflite_run/CMakeLists.txt | 2 +- .../nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh | 5 +- tools/tflitefile_tool/select_operator.py | 21 +- tools/tflkit/README.md | 12 +- tools/update_version/update-version | 11 +- 386 files changed, 38093 insertions(+), 13423 deletions(-) create mode 100644 .ahub/tcchecker-tca/config.yaml create mode 100644 compiler/.ahub/tcchecker-tca/config.yaml create mode 100644 compiler/bcq-tools/CMakeLists.txt create mode 100644 compiler/bcq-tools/README.md create mode 100644 compiler/bcq-tools/generate_bcq_output_arrays create mode 100644 compiler/bcq-tools/preserve_bcq_info create mode 100644 compiler/luci-interpreter/src/kernels/DepthToSpace.cpp create mode 100644 compiler/luci-interpreter/src/kernels/DepthToSpace.h create mode 100644 compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp create mode 100644 compiler/luci-interpreter/src/kernels/Reverse.cpp create mode 100644 compiler/luci-interpreter/src/kernels/Reverse.h create mode 100644 compiler/luci-interpreter/src/kernels/Reverse.test.cpp create mode 100644 compiler/luci-interpreter/src/kernels/Slice.cpp create mode 100644 compiler/luci-interpreter/src/kernels/Slice.h create mode 100644 compiler/luci-interpreter/src/kernels/Slice.test.cpp create mode 100644 compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp create mode 100644 compiler/vconone/CMakeLists.txt create mode 100644 compiler/vconone/README.md create mode 100644 compiler/vconone/driver/driver.cpp create mode 100644 compiler/vconone/include/vconone/vconone.h create mode 100644 compiler/vconone/src/version.cpp create mode 100644 
compiler/vconone/src/version.test.cpp create mode 100644 compiler/vconone/version_cfg.h.in delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h delete mode 100644 compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h create mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl delete mode 100644 
compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp delete mode 100644 compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp create mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp create mode 100644 compute/cker/include/cker/operation/L2Normalize.h create mode 100644 compute/cker/include/cker/operation/Quantize.h create mode 100644 compute/cker/include/cker/operation/SpaceToDepth.h create mode 100644 infra/cmake/packages/Pybind11Config.cmake create mode 100644 infra/cmake/packages/Pybind11SourceConfig.cmake create mode 100644 infra/scripts/build-tcm.sh create mode 
100644 master_diff_1.7.0.patch create mode 100644 res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe create mode 100644 res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse create mode 100644 res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe create mode 100644 res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse create mode 100644 res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule create mode 100644 res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe create mode 100644 res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse create mode 100644 res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe create mode 100644 res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_000/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_000/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_001/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_001/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_002/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_002/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_003/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_003/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse create mode 100644 res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe create mode 100644 res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse create mode 100644 runtime/onert/backend/acl_common/AclKernelGen.h create mode 100644 runtime/onert/backend/cpu/StaticTensorManager.cc create mode 100644 runtime/onert/backend/cpu/StaticTensorManager.h create mode 100644 runtime/onert/backend/cpu/ops/L2NormLayer.cc create mode 100644 runtime/onert/backend/cpu/ops/L2NormLayer.h create mode 100644 runtime/onert/backend/cpu/ops/QuantizeLayer.cc create mode 100644 runtime/onert/backend/cpu/ops/QuantizeLayer.h create mode 100644 runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc create mode 100644 runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h create mode 100644 runtime/onert/core/include/ir/operation/Quantize.h create mode 100644 runtime/onert/core/src/ir/operation/Quantize.cc delete mode 100644 runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc delete mode 100644 runtime/onert/core/src/ir/pass/PermutationEliminationPass.h create mode 100644 tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_1_float.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_1_quant8.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_all_dims.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_all_dims_quant8.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_low_rank.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/pad_v2_low_rank_quant8.mod.py (100%) rename tests/nnapi/specs/{skip => }/V1_2/quantize.mod.py (100%) diff --git a/.ahub/tcchecker-tca/config.yaml b/.ahub/tcchecker-tca/config.yaml new file mode 100644 index 0000000..cd34d79 --- /dev/null +++ b/.ahub/tcchecker-tca/config.yaml @@ -0,0 +1,43 @@ +version: 2 +test: + - name: NN Runtime + testCaseLanguage: CPP + testFW: GTEST + testCaseFolder: + - ./compute/test/cker + - ./runtime/onert/core/src/backend/cpu_common + - 
./runtime/onert/frontend/nnapi + - ./runtime/onert/test/core/compiler + - ./runtime/onert/test/core/exec + - ./runtime/onert/test/core/interp + - ./runtime/onert/test/graph + - ./runtime/onert/test/graph/operand + - ./runtime/onert/test/graph/operation + - ./runtime/onert/test/graph/verifier + - ./runtime/onert/test/ir + - ./runtime/onert/test/util + - ./tests/nnapi/src + - ./tests/nnfw_api/src + - ./tests/tools/tflite_run/src + + testFile: + - extension: cpp + any: true + - extension: cc + any: true + + testCase: + - condition: + - functionName: + starts: + - TEST + + negativeTestCase: + - condition: + - testName: + starts: + - neg_ + + positiveTestCase: + - condition: + - inverse: negativeTestCase diff --git a/compiler/.ahub/tcchecker-tca/config.yaml b/compiler/.ahub/tcchecker-tca/config.yaml new file mode 100644 index 0000000..ef681de --- /dev/null +++ b/compiler/.ahub/tcchecker-tca/config.yaml @@ -0,0 +1,54 @@ +version: 2 +test: + - name: NN Compiler + testCaseLanguage: CPP + testFW: GTEST + testCaseFolder: + - ./angkor + - ./arser + - ./circle2circle + - ./circle-quantizer + - ./cwrap + - ./foder + - ./hermes + - ./hermes-std + - ./loco + - ./locomotiv + - ./locop + - ./logo + - ./logo-core + - ./luci + - ./luci-interpreter + - ./luci-value-test + - ./mio-circle + - ./mio-tflite + - ./oops + - ./pepper-assert + - ./pepper-str + - ./pepper-strcast + - ./pp + - ./record-minmax + - ./safemain + - ./souschef + - ./stdex + - ./tflite2circle + + testFile: + - extension: .test.cpp + any: true + + testCase: + - condition: + - functionName: + starts: + - TEST + + negativeTestCase: + - condition: + - testName: + ends: + - _NEG + + positiveTestCase: + - condition: + - inverse: negativeTestCase diff --git a/compiler/bcq-tools/CMakeLists.txt b/compiler/bcq-tools/CMakeLists.txt new file mode 100644 index 0000000..ae231bd --- /dev/null +++ b/compiler/bcq-tools/CMakeLists.txt @@ -0,0 +1,27 @@ +set(BCQ_TOOLS_FILES + generate_bcq_output_arrays + preserve_bcq_info +) + +foreach(BCQ_TOOLS IN ITEMS ${BCQ_TOOLS_FILES}) + + set(BCQ_TOOLS_FILE ${BCQ_TOOLS}) + set(BCQ_TOOLS_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${BCQ_TOOLS_FILE}") + set(BCQ_TOOLS_BIN "${CMAKE_CURRENT_BINARY_DIR}/${BCQ_TOOLS_FILE}") + set(BCQ_TOOLS_TARGET "${BCQ_TOOLS}_target") + + add_custom_command(OUTPUT ${BCQ_TOOLS_BIN} + COMMAND ${CMAKE_COMMAND} -E copy "${BCQ_TOOLS_SRC}" "${BCQ_TOOLS_BIN}" + DEPENDS ${BCQ_TOOLS_SRC} + COMMENT "Generate ${BCQ_TOOLS_BIN}" + ) + + add_custom_target(${BCQ_TOOLS_TARGET} ALL DEPENDS ${BCQ_TOOLS_BIN}) + + install(FILES ${BCQ_TOOLS_BIN} + PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE + GROUP_READ GROUP_WRITE GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE + DESTINATION bin) + +endforeach(BCQ_TOOLS) diff --git a/compiler/bcq-tools/README.md b/compiler/bcq-tools/README.md new file mode 100644 index 0000000..18b0f48 --- /dev/null +++ b/compiler/bcq-tools/README.md @@ -0,0 +1,78 @@ +# BCQ Tools + +This directory includes tools related to BCQ. + +## preserve_bcq_info + +### Purpose + +`preserve_bcq_info` is for preserving constant nodes which include BCQ information. +When a `.pb` file is converted to a `.tflite` file by the TFLite converter, constant nodes whose values are exactly the same are removed and linked to a single representative node. +This makes it impossible to know which constant node should be linked to a node to which we want to apply BCQ. +One solution is to make all identical constant nodes different by inserting unique values, and then to ignore the newly generated unique values when BCQ fusing is applied.
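+A minimal standalone sketch of that workaround (illustrative only; this is not the tool's code, and the array values are made up):
+
+```python
+# Illustration: appending a unique negative value to otherwise identical
+# constants keeps a converter's constant-deduplication step from merging them.
+import numpy as np
+
+constants = {name: np.array([1, 2, 3]) for name in ("const1", "const2", "const3")}
+
+preserved = {}
+unique_value = -1
+for name, value in constants.items():
+    preserved[name] = np.append(value, unique_value)  # e.g. [1, 2, 3, -1]
+    unique_value -= 1
+
+# The originals are all equal; the preserved constants are pairwise distinct.
+assert len({tuple(v) for v in constants.values()}) == 1
+assert len({tuple(v) for v in preserved.values()}) == len(preserved)
+```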
+`preserve_bcq_info` generates and inserts unique dummy values into constant nodes whose values are the same, so that they are not removed by the TensorFlow Lite converter. +As a result, BCQ information will be preserved. + +### How to use + +```bash +preserve_bcq_info \ +--input_path /path/to/original_model.pb \ +--output_path /path/to/preserved_model.pb +``` + +### How it works + +If we add a unique dummy value at the end of each constant node, all the constant nodes become different. The following is an example. + +``` +[Original Constant Nodes] +const(value=[1, 2, 3], name='const1') +const(value=[1, 2, 3], name='const2') +const(value=[1, 2, 3], name='const3') + +[After BCQ information preserved] +const(value=[1, 2, 3, -1], name='const1') +const(value=[1, 2, 3, -2], name='const2') +const(value=[1, 2, 3, -3], name='const3') +``` + +For dummy values, negative values are used instead of positive values. +This is because positive values may be confused with original constant node values. +The unique dummy values start from -1 and proceed to -2, -3, ..., -N, where N is the number of preserved constant nodes. + +### Caution + +- Newly generated dummy values should be ignored when the constant nodes are used. + +## generate_bcq_output_arrays + +### Purpose + +To apply BCQ, BCQ information nodes should be designated as model outputs so that they remain alive even after TFLite conversion is finished. +However, there may be too many nodes to designate by hand, and sometimes the resulting string is too long to copy and paste. +`generate_bcq_output_arrays` is for generating output_arrays, which include BCQ information nodes. + +### How to use + +```bash +generate_bcq_output_arrays \ +--input_path /path/to/original_model.pb \ +--output_path /path/to/output_arrays.txt +``` + +### How it works + +``` +[Original BCQ information nodes] +const(value=[1, 2, 3, -1], name='const1') +const(value=[1, 2, 3, -2], name='const2') +const(value=[1, 2, 3, -3], name='const3') + +[Generated output_arrays] +,const1,const2,const3 +``` + +### Caution + +- The generated output_arrays string starts with a comma. diff --git a/compiler/bcq-tools/generate_bcq_output_arrays b/compiler/bcq-tools/generate_bcq_output_arrays new file mode 100644 index 0000000..48e8a93 --- /dev/null +++ b/compiler/bcq-tools/generate_bcq_output_arrays @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 + +import tensorflow as tf + +import argparse +import sys + + +def _get_parser(): + """ + Returns an ArgumentParser for generating output_arrays. + """ + parser = argparse.ArgumentParser( + description=("Command line tool to generate output_arrays of BCQ nodes")) + + # Input and output path.
+ parser.add_argument( + "-i", + "--input_path", + type=str, + help="Full filepath of the input file.", + required=True) + parser.add_argument( + "-o", + "--output_path", + type=str, + help="Full filepath of the output file.", + required=True) + + return parser + + +def load_graph(frozen_graph_filename): + """ + Load graph from frozen pb file + """ + with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f: + graph_def = tf.compat.v1.GraphDef() + graph_def.ParseFromString(f.read()) + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name='') + return graph + + +def dtype2str(dtype): + if dtype == "int32": + return "TF_INT32" + elif dtype == "int64": + return "TF_INT64" + elif dtype == "float32": + return "TF_FLOAT" + elif dtype == "bool": + return "TF_BOOL" + else: + raise Exception("Not supported dtype") + + +def print_output_arrays(flags): + graph_model = load_graph(flags.input_path) + graph_model_def = graph_model.as_graph_def() + ops = graph_model.get_operations() + + output_names = [op.outputs[0].name for op in ops + if op.type == "Const" and "bcqinfo_" in op.outputs[0].name] + + output_arrays = "" + for output_name in output_names: + output_arrays += "," + + colon_index = output_name.find(":") + if colon_index == -1: + output_arrays += output_name + else: + output_arrays += output_name[:colon_index] + + f = open(flags.output_path, 'w') + f.write(output_arrays) + f.close() + + +def main(): + # Parse argument. + parser = _get_parser() + flags = parser.parse_known_args(args=sys.argv[1:]) + + print_output_arrays(flags[0]) + + +if __name__ == "__main__": + main() diff --git a/compiler/bcq-tools/preserve_bcq_info b/compiler/bcq-tools/preserve_bcq_info new file mode 100644 index 0000000..2ede8d4 --- /dev/null +++ b/compiler/bcq-tools/preserve_bcq_info @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 + +import tensorflow as tf +import numpy as np + +import argparse +import sys + + +def _get_parser(): + """ + Returns an ArgumentParser for preserving BCQ information. + """ + parser = argparse.ArgumentParser( + description=("Command line tool to preserve BCQ information")) + + # Input and output path. + parser.add_argument( + "-i", + "--input_path", + type=str, + help="Full filepath of the input file.", + required=True) + parser.add_argument( + "-o", + "--output_path", + type=str, + help="Full filepath of the output file.", + required=True) + + return parser + + +def load_graph(frozen_graph_filename): + """ + Load graph from frozen pb file + """ + with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f: + graph_def = tf.compat.v1.GraphDef() + graph_def.ParseFromString(f.read()) + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name='') + return graph + + +def preserve_bcq_info(flags): + """ + Generate unique dummy value from -1 to -N. + + We use negative values to preserve BCQ information because + positive values may cause some confusion with real BCQ information values. 
+ """ + + class UniqueValueGen: + def __init__(self): + self.unique_value = -1 + + def gen(self): + val = self.unique_value + self.unique_value = val - 1 + return val + + unique_value = UniqueValueGen() + + original_graph_model = load_graph(flags.input_path) + original_graph_model_def = original_graph_model.as_graph_def() + + new_graph = tf.compat.v1.GraphDef() + substitution_dict = {} + + DT_INT32 = None # Just for copying DT_INT32 attribute value + + for node in original_graph_model_def.node: + if node.op == "Const": + # Because bcqinfo_do_w_x is BOOL type, we cannot add dummy value at the end. + # Therefore we should convert the type to INT32 type. + if "/bcqinfo_do_w_x" in node.name: + original_tensor = tf.make_ndarray(node.attr["value"].tensor) + substitution_dict[node.name] = tf.make_tensor_proto( + [int(original_tensor[0]), unique_value.gen()], tf.int32) + + preserved_bcqinfo_list = ["/bcqinfo_number_of_clusters", "/bcqinfo_size_of_clusters", + "/bcqinfo_qbits_of_clusters"] + + if any(name in node.name for name in preserved_bcqinfo_list): + original_tensor = tf.make_ndarray( + node.attr["value"].tensor) # variable name change + substitution_dict[node.name] = tf.make_tensor_proto( + np.append(original_tensor, unique_value.gen()), tf.int32) + DT_INT32 = node.attr["dtype"] + + for node in original_graph_model_def.node: + if node.name in substitution_dict: + new_node = new_graph.node.add() + new_node.op = "Const" + new_node.name = node.name + new_node.attr["dtype"].CopyFrom(DT_INT32) + new_node.attr["value"].tensor.CopyFrom(substitution_dict[node.name]) + else: + new_node = new_graph.node.add() + new_node.CopyFrom(node) + + tf.io.write_graph(new_graph, '.', flags.output_path, False) + + +def main(): + # Parse argument. + parser = _get_parser() + flags = parser.parse_known_args(args=sys.argv[1:]) + + # Generate a new pb file, which BCQ information is preserved. 
+ preserve_bcq_info(flags[0]) + + +if __name__ == "__main__": + main() diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt index 1335057..009bfab 100644 --- a/compiler/circle-quantizer/CMakeLists.txt +++ b/compiler/circle-quantizer/CMakeLists.txt @@ -13,5 +13,6 @@ target_link_libraries(circle-quantizer luci_service) target_link_libraries(circle-quantizer luci_pass) target_link_libraries(circle-quantizer luci_export) target_link_libraries(circle-quantizer arser) +target_link_libraries(circle-quantizer vconone) install(TARGETS circle-quantizer DESTINATION bin) diff --git a/compiler/circle-quantizer/requires.cmake b/compiler/circle-quantizer/requires.cmake index 2293e53..c21e28e 100644 --- a/compiler/circle-quantizer/requires.cmake +++ b/compiler/circle-quantizer/requires.cmake @@ -5,3 +5,4 @@ require("safemain") require("luci") require("oops") require("arser") +require("vconone") diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp index b56b547..8d3a80c 100644 --- a/compiler/circle-quantizer/src/CircleQuantizer.cpp +++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -36,6 +37,12 @@ using OptionHook = std::function; using Algorithms = luci::CircleOptimizer::Options::Algorithm; using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters; +void print_version(void) +{ + std::cout << "circle-quantizer version " << vconone::get_string() << std::endl; + std::cout << vconone::get_copyright() << std::endl; +} + int entry(int argc, char **argv) { // Simple argument parser (based on map) @@ -49,13 +56,20 @@ int entry(int argc, char **argv) arser::Arser arser("circle-quantizer provides circle model quantization"); + arser.add_argument("--version") + .nargs(0) + .required(false) + .default_value(false) + .help("Show version information and exit") + .exit_with(print_version); + arser.add_argument(qdqw) .nargs(3) .type(arser::DataType::STR_VEC) .required(false) .help("Quantize-dequantize weight values required action before quantization. " "Three arguments required: input_dtype(float32) " - "output_dtype(uint8) granularity(layer)"); + "output_dtype(uint8) granularity(layer, channel)"); arser.add_argument(qwmm) .nargs(3) @@ -63,7 +77,7 @@ int entry(int argc, char **argv) .required(false) .help("Quantize with min/max values. 
" "Three arguments required: input_dtype(float32) " - "output_dtype(uint8) granularity(layer)"); + "output_dtype(uint8) granularity(layer, channel)"); arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model"); arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model"); diff --git a/compiler/circle-tensordump/driver/Driver.cpp b/compiler/circle-tensordump/driver/Driver.cpp index a55cd45..38e3073 100644 --- a/compiler/circle-tensordump/driver/Driver.cpp +++ b/compiler/circle-tensordump/driver/Driver.cpp @@ -46,7 +46,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } std::unique_ptr dump; diff --git a/compiler/circle-tensordump/src/Dump.cpp b/compiler/circle-tensordump/src/Dump.cpp index dfa78f0..a8d3256 100644 --- a/compiler/circle-tensordump/src/Dump.cpp +++ b/compiler/circle-tensordump/src/Dump.cpp @@ -136,6 +136,7 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s auto max = quant_param->max(); auto scale = quant_param->scale(); auto zero_point = quant_param->zero_point(); + auto quantized_dimension = quant_param->quantized_dimension(); os << " " + print_format2 + "   ├── min : "; ::print_comma_sepearted(os, min); @@ -146,9 +147,11 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s os << " " + print_format2 + "   ├── scale : "; ::print_comma_sepearted(os, scale); os << std::endl; - os << " " + print_format2 + "   └── zero_point : "; + os << " " + print_format2 + "   ├── zero_point : "; ::print_comma_sepearted(os, zero_point); os << std::endl; + os << " " + print_format2 + "   └── quantized_dimension : " << quantized_dimension; + os << std::endl; } // buffer @@ -229,7 +232,7 @@ std::vector hdf5_dims_cast(const flatbuffers::Vector *data, } /** - * This function writes data to given hdf5 file like below. + * This function writes vector data to given hdf5 file like below. 
* * GROUP "group_name" * ㄴDATATYPE "type" @@ -238,9 +241,9 @@ std::vector hdf5_dims_cast(const flatbuffers::Vector *data, * ㄴDATA "data" */ template -void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, - const H5::PredType &type, const flatbuffers::Vector *data, - std::vector dims) +void write_vector_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, + const H5::PredType &type, const flatbuffers::Vector *data, + std::vector dims) { if (data == nullptr) return; @@ -250,6 +253,17 @@ void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string d dataset->write(data->data(), type); } +/// @brief This function writes scalar data to given hdf5 file +template +void write_scalar_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, + const H5::PredType &type, T data) +{ + auto dataspace = std::make_unique(H5S_SCALAR); + auto dataset = std::make_unique( + file.createDataSet(group_name + "/" + dataset_name, type, *dataspace)); + dataset->write(&data, type); +} + } // namespace namespace circletensordump @@ -297,8 +311,9 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model, auto buff_data_ptr = reader.buffers()->Get(buff_idx)->data(); if (buff_data_ptr) { - ::write_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()), - buff_data_ptr, ::hdf5_dims_cast(buff_data_ptr, tensor->shape())); + ::write_vector_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()), + buff_data_ptr, + ::hdf5_dims_cast(buff_data_ptr, tensor->shape())); } // write quantization parameters @@ -306,17 +321,20 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model, if (quant_param) { auto min = quant_param->min(); - ::write_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min, - ::hdf5_dims_cast(min)); + ::write_vector_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min, + ::hdf5_dims_cast(min)); auto max = quant_param->max(); - ::write_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max, - ::hdf5_dims_cast(max)); + ::write_vector_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max, + ::hdf5_dims_cast(max)); auto scale = quant_param->scale(); - ::write_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale, - ::hdf5_dims_cast(scale)); + ::write_vector_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale, + ::hdf5_dims_cast(scale)); auto zero_point = quant_param->zero_point(); - ::write_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, zero_point, - ::hdf5_dims_cast(zero_point)); + ::write_vector_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, + zero_point, ::hdf5_dims_cast(zero_point)); + auto quantized_dimension = quant_param->quantized_dimension(); + ::write_scalar_data_to_hdf5(file, group_name, "quantized_dimension", + H5::PredType::NATIVE_INT32, quantized_dimension); } } } diff --git a/compiler/circle-verify/src/Driver.cpp b/compiler/circle-verify/src/Driver.cpp index 1af31d9..7a44c65 100644 --- a/compiler/circle-verify/src/Driver.cpp +++ b/compiler/circle-verify/src/Driver.cpp @@ -35,7 +35,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } auto verifier = std::make_unique(); diff --git a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt 
b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
index 6663cb9..4bcaae3 100644
--- a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
+++ b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
@@ -1,25 +1,12 @@
 nnas_include(TargetRequire)
 
 unset(REQUIRED_TARGETS)
-list(APPEND REQUIRED_TARGETS circlechef)
 list(APPEND REQUIRED_TARGETS circle-inspect)
 list(APPEND REQUIRED_TARGETS circle-verify)
 list(APPEND REQUIRED_TARGETS circle2circle)
 list(APPEND REQUIRED_TARGETS dredd_rule_lib)
-list(APPEND REQUIRED_TARGETS tflchef)
-list(APPEND REQUIRED_TARGETS tflite2circle)
 TargetRequire_Return(${REQUIRED_TARGETS})
 
-nncc_find_resource(TensorFlowLiteRecipes)
-nncc_find_resource(CircleRecipes)
-
-set(TFLITE_RECIPE_REPO "${TensorFlowLiteRecipes_DIR}")
-set(CIRCLE_RECIPE_REPO "${CircleRecipes_DIR}")
-unset(RECIPE_REPO)
-
-set(TEST_RECIPE_FILENAME "test.recipe")
-set(TEST_RULE_FILENAME "test.rule")
-
 unset(TEST_DEPS)
 unset(TEST_NAMES)
 
@@ -27,21 +14,9 @@ set(options "")
 set(oneValueArgs "")
 set(multiValueArgs PASS)
 
-macro(Add RECIPE)
-  if(NOT EXISTS "${TFLITE_RECIPE_REPO}/${RECIPE}/test.recipe")
-    if(NOT EXISTS "${CIRCLE_RECIPE_REPO}/${RECIPE}/test.recipe")
-      message(FATAL_ERROR "Missing recipe of '${RECIPE}' test")
-    else()
-      set(RECIPE_REPO ${CIRCLE_RECIPE_REPO})
-    endif()
-  else()
-    set(RECIPE_REPO ${TFLITE_RECIPE_REPO})
-  endif()
-
-  if(NOT EXISTS "${RECIPE_REPO}/${RECIPE}/test.rule")
-    message(FATAL_ERROR "Missing rule of '${RECIPE}' test")
-  endif()
+get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
 
+macro(Add RECIPE)
   cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   unset(OPT_OPTIONS)
   foreach(src ${ARG_PASS})
@@ -49,71 +24,20 @@ macro(Add RECIPE)
     list(APPEND OPT_OPTIONS "--${src}")
   endforeach(src ${ARG_PASS})
 
-  set(RECIPE_FILE "${RECIPE}.recipe")
-  set(RECIPE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RECIPE_FILENAME}")
-  set(RECIPE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RECIPE_FILE}")
-
-  set(RULE_FILE "${RECIPE}.rule")
-  set(RULE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RULE_FILENAME}")
-  set(RULE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RULE_FILE}")
-
-  set(TFLITE_FILE "${RECIPE}.tflite")
-  set(TFLITE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${TFLITE_FILE}")
-
   set(CIRCLE_FILE "${RECIPE}.circle")
-  set(CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${CIRCLE_FILE}")
+  set(CIRCLE_PATH "${ARTIFACTS_BIN_PATH}/${CIRCLE_FILE}")
 
   set(OPT_CIRCLE_FILE "${RECIPE}.opt.circle")
   set(OPT_CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${OPT_CIRCLE_FILE}")
 
-  # Copy .recipe
-  add_custom_command(OUTPUT ${RECIPE_BINARY_PATH}
-    COMMAND ${CMAKE_COMMAND} -E copy "${RECIPE_SOURCE_PATH}" "${RECIPE_BINARY_PATH}"
-    DEPENDS ${RECIPE_SOURCE_PATH}
-    COMMENT "Generate ${RECIPE_FILE}"
-  )
-
-  # Copy .rule
-  add_custom_command(OUTPUT ${RULE_BINARY_PATH}
-    COMMAND ${CMAKE_COMMAND} -E copy "${RULE_SOURCE_PATH}" "${RULE_BINARY_PATH}"
-    DEPENDS ${RULE_SOURCE_PATH}
-    COMMENT "Generate ${RULE_FILE}"
-  )
-
-  if(${RECIPE_REPO} STREQUAL ${TFLITE_RECIPE_REPO})
-    # Generate .tflite
-    add_custom_command(OUTPUT ${TFLITE_OUTPUT_PATH}
-      COMMAND $<TARGET_FILE:tflchef-file> ${RECIPE_BINARY_PATH} ${TFLITE_OUTPUT_PATH}
-      DEPENDS $<TARGET_FILE:tflchef-file> ${RECIPE_BINARY_PATH}
-      COMMENT "Generate ${TFLITE_FILE}"
-    )
-
-    # Generate .circle
-    add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
-      COMMAND $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH} ${CIRCLE_OUTPUT_PATH}
-      DEPENDS $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH}
-      COMMENT "Generate ${CIRCLE_FILE}"
-    )
-
-    list(APPEND TEST_DEPS ${TFLITE_OUTPUT_PATH})
-  else()
-    # Generate .circle
-    add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
-      COMMAND $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH} ${CIRCLE_OUTPUT_PATH}
-      DEPENDS $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH}
-      COMMENT "Generate ${CIRCLE_FILE}"
-    )
-  endif()
-
   # Generate optimized .circle
   add_custom_command(OUTPUT ${OPT_CIRCLE_OUTPUT_PATH}
-    COMMAND $<TARGET_FILE:circle2circle> ${OPT_OPTIONS} ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
-    DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_OUTPUT_PATH}
+    COMMAND $<TARGET_FILE:circle2circle> ${OPT_OPTIONS} ${CIRCLE_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
+    DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_PATH}
     COMMENT "Generate ${OPT_CIRCLE_FILE}"
   )
 
-  list(APPEND TEST_DEPS ${RECIPE_BINARY_PATH} ${RULE_BINARY_PATH}
-                        ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH})
+  list(APPEND TEST_DEPS ${OPT_CIRCLE_OUTPUT_PATH})
   list(APPEND TEST_NAMES ${RECIPE})
 endmacro(Add)
 
@@ -174,12 +98,15 @@ list(APPEND TEST_DEPS "${RULE_LIB_BINARY_PATH}")
 
 # Generate dependencies
 add_custom_target(circle2circle_dredd_recipe_test ALL DEPENDS ${TEST_DEPS})
+add_dependencies(circle2circle_dredd_recipe_test common_artifacts_deps)
+
+get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
 
 # Run tests
 add_test(
   NAME circle2circle_dredd_recipe_test
   COMMAND "${TEST_RUNNER}"
           "${TEST_CONFIG}"
-          "${CMAKE_CURRENT_BINARY_DIR}"
+          "${ARTIFACTS_BIN_PATH}"
          ${TEST_NAMES}
 )
diff --git a/compiler/circle2circle-dredd-recipe-test/requires.cmake b/compiler/circle2circle-dredd-recipe-test/requires.cmake
index e4a5b71..70e7c52 100644
--- a/compiler/circle2circle-dredd-recipe-test/requires.cmake
+++ b/compiler/circle2circle-dredd-recipe-test/requires.cmake
@@ -1,7 +1,5 @@
-require("circlechef")
 require("circle2circle")
 require("circle-inspect")
 require("circle-verify")
+require("common-artifacts")
 require("dredd-rule-lib")
-require("tflchef")
-require("tflite2circle")
diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index 202f669..6328a64 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -11,9 +11,10 @@
 ## TFLITE RECIPE
 
 Add(Net_InstanceNorm_001 PASS fuse_instnorm)
-# Add(Net_InstanceNorm_002 PASS fuse_instnorm)
+Add(Net_InstanceNorm_002 PASS fuse_instnorm)
 Add(BatchMatMulV2_000 PASS resolve_customop_batchmatmul)
 Add(MatMul_000 PASS resolve_customop_matmul)
+Add(DepthwiseConv2D_003 PASS)
 
 ## CIRCLE RECIPE
diff --git a/compiler/circle2circle-dredd-recipe-test/testall.sh b/compiler/circle2circle-dredd-recipe-test/testall.sh
index 33a2036..2899587 100755
--- a/compiler/circle2circle-dredd-recipe-test/testall.sh
+++ b/compiler/circle2circle-dredd-recipe-test/testall.sh
@@ -13,21 +13,22 @@
 if [[ $# -lt 2 ]]; then
   exit 255
 fi
 
+WORKDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 CONFIG_PATH="$1"; shift
-WORKDIR="$1"; shift
+RESOURCE_DIR="$1"; shift
 
 source "${CONFIG_PATH}"
 
 echo "-- Found circle-inspect: ${CIRCLE_INSPECT_PATH}"
 echo "-- Found circle-verify: ${CIRCLE_VERIFY_PATH}"
 echo "-- Found circle2circle: ${CIRCLE2CIRCLE_PATH}"
-echo "-- Found workdir: ${WORKDIR}"
+echo "-- Found common-artifacts: ${RESOURCE_DIR}"
 
 TESTED=()
 PASSED=()
 FAILED=()
 
-pushd "${WORKDIR}"
+pushd ${WORKDIR}
 while [[ $# -ne 0 ]]; do
   PREFIX="$1"; shift
 
@@ -40,7 +41,7 @@ while [[ $# -ne 0 ]]; do
   cat > "${PREFIX}.log" <(
     exec 2>&1
 
-    echo "-- Found tflite: ${PREFIX}.tflite"
+    echo "-- Found circle: ${PREFIX}.opt.circle"
 
     # Exit immediately if any command fails
     set -e
@@ -55,7 +56,7 @@ while [[ $# -ne 0 ]]; do
     set +x
 
     # (COMPILED_FILE, INSPECT_PROG_PATH, VERIFY_PROG_PATH, ERROR_LOG) must be set for rule-lib.sh
-    COMPILED_FILE="${WORKDIR}/${PREFIX}.opt.circle"
+    COMPILED_FILE="${PREFIX}.opt.circle"
     INSPECT_PROG_PATH=${CIRCLE_INSPECT_PATH}
     VERIFY_PROG_PATH=${CIRCLE_VERIFY_PATH}
     ERROR_LOG="${PREFIX}.error"
@@ -66,7 +67,7 @@ while [[ $# -ne 0 ]]; do
     trap 'echo "** ERROR **" ; cat "${ERROR_LOG}"' ERR
 
     source rule-lib.sh
-    source "${PREFIX}.rule"
+    source "${RESOURCE_DIR}/${PREFIX}.rule"
 
     # unset trap - ERR
diff --git a/compiler/circle2circle/CMakeLists.txt b/compiler/circle2circle/CMakeLists.txt
index 7b2bf9b..f60c896 100644
--- a/compiler/circle2circle/CMakeLists.txt
+++ b/compiler/circle2circle/CMakeLists.txt
@@ -19,6 +19,7 @@ target_link_libraries(circle2circle luci_service)
 target_link_libraries(circle2circle luci_pass)
 target_link_libraries(circle2circle luci_export)
 target_link_libraries(circle2circle arser)
+target_link_libraries(circle2circle vconone)
 
 install(TARGETS circle2circle DESTINATION bin)
 
@@ -44,3 +45,4 @@ target_link_libraries(circle2circle_test luci_service)
 target_link_libraries(circle2circle_test luci_pass)
 target_link_libraries(circle2circle_test luci_export)
 target_link_libraries(circle2circle_test arser)
+target_link_libraries(circle2circle_test vconone)
diff --git a/compiler/circle2circle/requires.cmake b/compiler/circle2circle/requires.cmake
index 8cbb90d..36a9efd 100644
--- a/compiler/circle2circle/requires.cmake
+++ b/compiler/circle2circle/requires.cmake
@@ -9,3 +9,4 @@ require("hermes")
 require("hermes-std")
 require("luci")
 require("arser")
+require("vconone")
diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index 6888d26..849597b 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -26,6 +26,7 @@
 #include
 #include
+#include <vconone/vconone.h>
 
 #include
 #include
@@ -34,6 +35,12 @@
 using Algorithms = luci::CircleOptimizer::Options::Algorithm;
 using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters;
 
+void print_version(void)
+{
+  std::cout << "circle2circle version " << vconone::get_string() << std::endl;
+  std::cout << vconone::get_copyright() << std::endl;
+}
+
 int entry(int argc, char **argv)
 {
   // Simple argument parser (based on map)
@@ -44,6 +51,13 @@ int entry(int argc, char **argv)
 
   arser::Arser arser("circle2circle provides circle model optimization and transformations");
 
+  arser.add_argument("--version")
+      .nargs(0)
+      .required(false)
+      .default_value(false)
+      .help("Show version information and exit")
+      .exit_with(print_version);
+
   arser.add_argument("--all").nargs(0).required(false).default_value(false).help(
       "Enable all optimize options");
 
diff --git a/compiler/circlechef/CMakeLists.txt b/compiler/circlechef/CMakeLists.txt
index cba7d0a..3e2ddcb 100644
--- a/compiler/circlechef/CMakeLists.txt
+++ b/compiler/circlechef/CMakeLists.txt
@@ -18,4 +18,6 @@ add_subdirectory(core)
 add_subdirectory(circle)
 # Tools
 add_subdirectory(tools)
-add_subdirectory(tests)
+if(ENABLE_TEST)
+  add_subdirectory(tests)
+endif(ENABLE_TEST)
diff --git a/compiler/circlechef/circle/src/RecipeChef.cpp b/compiler/circlechef/circle/src/RecipeChef.cpp
index 17ef1be..51326c7 100644
--- a/compiler/circlechef/circle/src/RecipeChef.cpp
+++ b/compiler/circlechef/circle/src/RecipeChef.cpp
@@ -181,6 +181,8 @@ std::unique_ptr generate_recipe(const circle::Model *model)
       for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx)
         chef_quant->add_zero_point(quant->zero_point()->Get(idx));
     }
+    circlechef::TensorQuantization *chef_quant = operand->mutable_quant();
+    chef_quant->set_quantized_dimension(quant->quantized_dimension());
   }
 }
diff --git a/compiler/circlechef/core/src/ModelChef.cpp b/compiler/circlechef/core/src/ModelChef.cpp
index 76aeacd..d81467d 100644
--- a/compiler/circlechef/core/src/ModelChef.cpp
+++ b/compiler/circlechef/core/src/ModelChef.cpp
@@ -413,6 +413,7 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
     quant_builder.add_min(quant_min);
     quant_builder.add_scale(quant_scale);
     quant_builder.add_zero_point(quant_zero_point);
+    quant_builder.add_quantized_dimension(quant.quantized_dimension());
 
     // Update QuantizationParameters Index
     quant_index = quant_builder.Finish();
diff --git a/compiler/circlechef/proto/circlechef.proto b/compiler/circlechef/proto/circlechef.proto
index b8c009b..3e5e6b1 100644
--- a/compiler/circlechef/proto/circlechef.proto
+++ b/compiler/circlechef/proto/circlechef.proto
@@ -35,6 +35,7 @@ message TensorQuantization {
   repeated float max = 2;
   repeated float scale = 3;
   repeated int64 zero_point = 4;
+  optional int32 quantized_dimension = 5 [default = 0];
 }
 
 message Operand {
diff --git a/compiler/circlechef/tools/file/Driver.cpp b/compiler/circlechef/tools/file/Driver.cpp
index a15da40..bcc0c7a 100644
--- a/compiler/circlechef/tools/file/Driver.cpp
+++ b/compiler/circlechef/tools/file/Driver.cpp
@@ -41,7 +41,7 @@ int entry(int argc, char **argv)
   {
     std::cout << err.what() << std::endl;
     std::cout << arser;
-    return 0;
+    return 255;
   }
 
   int32_t model_version = 1;
diff --git a/compiler/circlechef/tools/reverse/Driver.cpp b/compiler/circlechef/tools/reverse/Driver.cpp
index 9c0b9ea..8a2b85f 100644
--- a/compiler/circlechef/tools/reverse/Driver.cpp
+++ b/compiler/circlechef/tools/reverse/Driver.cpp
@@ -38,7 +38,7 @@ int entry(int argc, char **argv)
   {
     std::cout << err.what() << std::endl;
     std::cout << arser;
-    return 0;
+    return 255;
  }
 
  std::string circle_path = arser.get<std::string>("circle");
diff --git a/compiler/circledump/driver/Driver.cpp b/compiler/circledump/driver/Driver.cpp
index b8f561f..657f24f 100644
--- a/compiler/circledump/driver/Driver.cpp
+++ b/compiler/circledump/driver/Driver.cpp
@@ -33,7 +33,7 @@ int entry(int argc, char **argv)
   {
     std::cout << err.what() << '\n';
     std::cout << arser;
-    return 0;
+    return 255;
  }
 
  std::string circle_path = arser.get<std::string>("circle");
diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp
index 2c03203..5aa5d51 100644
--- a/compiler/circledump/src/OpPrinter.cpp
+++ b/compiler/circledump/src/OpPrinter.cpp
@@ -593,6 +593,20 @@ public:
   }
 };
 
+class UniquePrinter : public OpPrinter
+{
+public:
+  void options(const circle::Operator *op, std::ostream &os) const override
+  {
+    if (auto *params = op->builtin_options_as_UniqueOptions())
+    {
+      os << " ";
+      os << "idx_out_type(" << EnumNameTensorType(params->idx_out_type()) << ") ";
+      os << std::endl;
+    }
+  }
+};
+
 class WhilePrinter : public OpPrinter
 {
 public:
@@ -744,6 +758,7 @@ OpPrinterRegistry::OpPrinterRegistry()
   _op_map[circle::BuiltinOperator_SUM] = make_unique<ReducerPrinter>();
   _op_map[circle::BuiltinOperator_TRANSPOSE_CONV] = make_unique<TransposeConvPrinter>();
   // There is no Option for TOPK_V2
+  _op_map[circle::BuiltinOperator_UNIQUE] = make_unique<UniquePrinter>();
   _op_map[circle::BuiltinOperator_WHILE] = make_unique<WhilePrinter>();
 
   _op_map[circle::BuiltinOperator_CUSTOM] = make_unique<CustomOpPrinter>();
diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst
index b614b71..d3f5601 100644
--- a/compiler/common-artifacts/exclude.lst
+++ b/compiler/common-artifacts/exclude.lst
@@ -5,9 +5,12 @@
 #[[ optimize : Exclude from circle optimization(circle2circle) ]]
 
 ## TensorFlowLiteRecipes
-optimize(ReLU6_000)
-optimize(Where_000) -optimize(Where_001) +optimize(Unique_000) +optimize(Unique_001) +optimize(Unique_002) +optimize(Unique_003) +optimize(Unique_U8_000) +optimize(Unique_U8_001) ## CircleRecipes @@ -46,6 +49,7 @@ tcgenerate(DepthToSpace_000) tcgenerate(DepthwiseConv2D_001) # runtime doesn't support dilation tcgenerate(DepthwiseConv2D_003) # runtime doesn't support dilation tcgenerate(DepthwiseConv2D_U8_000) +tcgenerate(DepthwiseConv2D_U8_001) # luci-interpreter doesn't support channel-wise quantization yet tcgenerate(Div_000) tcgenerate(ELU_000) tcgenerate(Equal_000) @@ -96,7 +100,7 @@ tcgenerate(Neg_000) tcgenerate(Net_Dangle_001) tcgenerate(Net_InstanceNorm_001) tcgenerate(Net_InstanceNorm_002) -tcgenerate(Net_ZeroDim_001) # fix luci +tcgenerate(Net_ZeroDim_001) # luci-interpreter doesn't support zero dim tcgenerate(NotEqual_000) tcgenerate(OneHot_000) tcgenerate(OneHot_001) @@ -120,9 +124,9 @@ tcgenerate(ReduceProd_001) tcgenerate(ReduceProd_002) tcgenerate(ReduceProd_003) tcgenerate(ReLU_000) -tcgenerate(ReLU6_000) # luci NYI +tcgenerate(ReLU6_000) tcgenerate(ReLUN1To1_000) -tcgenerate(Reshape_003) # fix luci +tcgenerate(Reshape_003) # luci-interpreter doesn't support reshape without built-in option tcgenerate(Reshape_U8_000) tcgenerate(ResizeBilinear_000) tcgenerate(ResizeNearestNeighbor_000) @@ -148,7 +152,7 @@ tcgenerate(SpaceToBatchND_002) tcgenerate(SpaceToBatchND_003) tcgenerate(SpaceToDepth_000) tcgenerate(SparseToDense_000) -tcgenerate(SplitV_000) # fix luci +tcgenerate(SplitV_000) tcgenerate(Sqrt_000) tcgenerate(Square_000) tcgenerate(SquaredDifference_000) @@ -164,22 +168,21 @@ tcgenerate(Sum_001) tcgenerate(Tanh_000) tcgenerate(Tile_000) tcgenerate(Tile_U8_000) -tcgenerate(TopKV2_000) # fix luci -tcgenerate(TopKV2_001) # fix luci -tcgenerate(TransposeConv_000) # fix interpreter +tcgenerate(TopKV2_000) +tcgenerate(TopKV2_001) tcgenerate(Unique_000) tcgenerate(Unique_001) tcgenerate(Unique_002) tcgenerate(Unique_003) tcgenerate(Unique_U8_000) tcgenerate(Unique_U8_001) -tcgenerate(Where_000) # luci NYI -tcgenerate(Where_001) # luci NYI -tcgenerate(While_000) # fix luci +tcgenerate(Where_000) +tcgenerate(Where_001) +tcgenerate(While_000) tcgenerate(While_001) tcgenerate(While_002) tcgenerate(While_003) -tcgenerate(YUV_TO_RGB_000) # fix luci +tcgenerate(YUV_TO_RGB_000) tcgenerate(YUV_TO_RGB_U8_000) tcgenerate(ZerosLike_000) diff --git a/compiler/hermes/src/hermes.test.cpp b/compiler/hermes/src/hermes.test.cpp index 2cbc093..ea7ef65 100644 --- a/compiler/hermes/src/hermes.test.cpp +++ b/compiler/hermes/src/hermes.test.cpp @@ -18,7 +18,28 @@ #include -TEST(HermesTest, simple_usecase) +namespace { - // TO BE FILLED + +class Logger final : public hermes::Source +{ +public: + Logger(hermes::Context *ctx); + ~Logger(); +}; + +Logger::Logger(hermes::Context *ctx) { activate(ctx->sources(), ctx->bus()); } +Logger::~Logger() { deactivate(); } + +} // namespace + +TEST(HermesTest, logger_constructor_NEG) +{ + hermes::Context context; + // we expect segmentfault from nullptr->sources() + ASSERT_DEATH(Logger logger(&context), ""); + + SUCCEED(); } + +// TODO add HermesTest simple_usecase diff --git a/compiler/locomotiv/src/Node/BiasEncode.test.cpp b/compiler/locomotiv/src/Node/BiasEncode.test.cpp index cdb255c..4680f5c 100644 --- a/compiler/locomotiv/src/Node/BiasEncode.test.cpp +++ b/compiler/locomotiv/src/Node/BiasEncode.test.cpp @@ -90,6 +90,16 @@ template void test() } } // namespace -TEST(NodeExecution_BiasEncode, s32) { test(); } +TEST(NodeExecution_BiasEncode, s32) +{ + test(); + + 
SUCCEED(); +} -TEST(NodeExecution_BiasEncode, f32) { test(); } +TEST(NodeExecution_BiasEncode, f32) +{ + test(); + + SUCCEED(); +} diff --git a/compiler/locomotiv/src/Node/MatMul.test.cpp b/compiler/locomotiv/src/Node/MatMul.test.cpp index f1f3a52..7d942e1 100644 --- a/compiler/locomotiv/src/Node/MatMul.test.cpp +++ b/compiler/locomotiv/src/Node/MatMul.test.cpp @@ -142,6 +142,8 @@ TEST(NodeExecution_MatMul, f32_2x3_3x3) }; run_test(lhs, rhs, out, Shape{2, 3}, Shape{3, 3}, Shape{2, 3}, loco::DataType::FLOAT32); + + SUCCEED(); } /* from the code below: @@ -183,6 +185,8 @@ TEST(NodeExecution_MatMul, s32_4x2_2x6) }; run_test(lhs, rhs, out, Shape{4, 2}, Shape{2, 6}, Shape{4, 6}, loco::DataType::S32); + + SUCCEED(); } // clang-format on diff --git a/compiler/locop/src/FormattedGraph.test.cpp b/compiler/locop/src/FormattedGraph.test.cpp index c9808d3..aff9ebe 100644 --- a/compiler/locop/src/FormattedGraph.test.cpp +++ b/compiler/locop/src/FormattedGraph.test.cpp @@ -28,6 +28,8 @@ TEST(LinearV1FormatterTest, simple) // TODO Validate the output (when the implementation becomes stable) std::cout << locop::fmt(g) << std::endl; + + SUCCEED(); } TEST(LinearV1FormatterTest, user_defined_node_summary_builder) diff --git a/compiler/locop/src/FormattedTensorShape.test.cpp b/compiler/locop/src/FormattedTensorShape.test.cpp index 0f0017a..fc85df3 100644 --- a/compiler/locop/src/FormattedTensorShape.test.cpp +++ b/compiler/locop/src/FormattedTensorShape.test.cpp @@ -30,4 +30,6 @@ TEST(FormattedTensorShapeTest, BracketFormat) tensor_shape->dim(0) = 4; std::cout << fmt(tensor_shape.get()) << std::endl; + + SUCCEED(); } diff --git a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h index 9987898..4ac3d86 100644 --- a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h +++ b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h @@ -79,12 +79,11 @@ private: // // Note that due to historical and performance reasons, per-tensor quantization uses unsigned // integer types, while per-channel uses signed types assuming 'zero_point' == 0. -// -// TODO Add 'quantized_dimension' field for per-channel case when IR provides it. 
struct AffineQuantization { std::vector scale; std::vector zero_point; + int32_t quantized_dimension; }; class Tensor @@ -108,6 +107,12 @@ public: return _quantization.zero_point[0]; } + const std::vector &scales() const { return _quantization.scale; } + + const std::vector &zero_points() const { return _quantization.zero_point; } + + int32_t quantized_dimension() const { return _quantization.quantized_dimension; } + template const T *data() const { return reinterpret_cast(_data.get()); } template T *data() { return reinterpret_cast(_data.get()); } diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h index a32e0d4..65d1197 100644 --- a/compiler/luci-interpreter/src/core/KernelParams.h +++ b/compiler/luci-interpreter/src/core/KernelParams.h @@ -56,6 +56,11 @@ struct Conv2DParams Activation activation; }; +struct DepthToSpaceParams +{ + int block_size; +}; + struct DepthwiseConv2DParams { Padding padding; diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt index fe36231..a1fd1de 100644 --- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt +++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt @@ -12,6 +12,8 @@ set(SOURCES Concatenation.cpp Conv2D.h Conv2D.cpp + DepthToSpace.h + DepthToSpace.cpp DepthwiseConv2D.h DepthwiseConv2D.cpp Elu.h @@ -40,6 +42,10 @@ set(SOURCES Pad.cpp Reshape.h Reshape.cpp + Reverse.h + Reverse.cpp + Slice.h + Slice.cpp Softmax.h Softmax.cpp SpaceToDepth.h @@ -77,6 +83,7 @@ set(TEST_SOURCES AveragePool2D.test.cpp Concatenation.test.cpp Conv2D.test.cpp + DepthToSpace.test.cpp DepthwiseConv2D.test.cpp Elu.test.cpp FullyConnected.test.cpp @@ -91,6 +98,8 @@ set(TEST_SOURCES Mul.test.cpp Pad.test.cpp Reshape.test.cpp + Reverse.test.cpp + Slice.test.cpp Softmax.test.cpp SpaceToDepth.test.cpp Split.test.cpp diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp new file mode 100644 index 0000000..cab63e2 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "DepthToSpace.h" +#include "Utils.h" +#include + +namespace luci_interpreter +{ +namespace kernels +{ + +DepthToSpace::DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams ¶ms) + : KernelWithParams({input}, {output}, params) +{ +} + +void DepthToSpace::configure() +{ + if (input()->shape().num_dims() != 4) + { + throw std::runtime_error("Invalid input num_dims."); + } + if (output()->element_type() != DataType::FLOAT32 && output()->element_type() != DataType::U8 && + output()->element_type() != DataType::S8 && output()->element_type() != DataType::S32 && + output()->element_type() != DataType::S64) + { + throw std::runtime_error("Invalid output type"); + } + if (input()->element_type() != output()->element_type()) + { + throw std::runtime_error("Type mismatch on input and output."); + } + const int block_size = params().block_size; + const int32_t input_height = input()->shape().dim(1); + const int32_t input_width = input()->shape().dim(2); + const int32_t input_channels = input()->shape().dim(3); + int32_t output_height = input_height * block_size; + int32_t output_width = input_width * block_size; + int32_t output_channels = input_channels / block_size / block_size; + + assert(input_height == output_height / block_size); + assert(input_width == output_width / block_size); + assert(input_channels == output_channels * block_size * block_size); + + Shape output_shape(4); + output_shape.dim(0) = input()->shape().dim(0); + output_shape.dim(1) = output_height; + output_shape.dim(2) = output_width; + output_shape.dim(3) = output_channels; + + output()->resize(output_shape); +} + +void DepthToSpace::execute() const +{ + tflite::DepthToSpaceParams op_params; + op_params.block_size = params().block_size; + switch (input()->element_type()) + { + case DataType::FLOAT32: + tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()), + getTensorData(input()), getTensorShape(output()), + getTensorData(output())); + break; + case DataType::U8: + tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()), + getTensorData(input()), getTensorShape(output()), + getTensorData(output())); + break; + default: + throw std::runtime_error("Unsupported Type."); + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.h b/compiler/luci-interpreter/src/kernels/DepthToSpace.h new file mode 100644 index 0000000..63ce376 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H +#define LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H + +#include "core/Kernel.h" +#include "core/KernelParams.h" + +#include + +namespace luci_interpreter +{ +namespace kernels +{ + +class DepthToSpace : public KernelWithParams +{ +public: + DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams ¶ms); + + const Tensor *input() const { return _inputs[0]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp new file mode 100644 index 0000000..1b80570 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/DepthToSpace.h" +#include "kernels/TestUtils.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +template class DepthToSpaceTest : public ::testing::Test +{ +}; + +using DataTypes = ::testing::Types; +TYPED_TEST_CASE(DepthToSpaceTest, DataTypes); + +TYPED_TEST(DepthToSpaceTest, SimpleCase) +{ + std::vector input_data{1, 2, 3, 4, 5, 6, 7, 8}; + Shape input_shape{1, 1, 2, 4}; + std::vector output_data{1, 2, 5, 6, 3, 4, 7, 8}; + std::vector output_shape{1, 2, 4, 1}; + + Tensor input_tensor = makeInputTensor()>(input_shape, input_data); + Tensor output_tensor = makeOutputTensor(getElementType()); + + DepthToSpaceParams params{}; + params.block_size = 2; + + DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params); + kernel.configure(); + kernel.execute(); + + EXPECT_THAT(extractTensorData(output_tensor), + ::testing::ElementsAreArray(output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp index fad450d..f53eaca 100644 --- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp +++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp @@ -45,12 +45,9 @@ TEST(L2NormalizeTest, Float) ElementsAreArray(ArrayFloatNear(ref_output_data))); } -TEST(L2NormalizeTest, Uint8Quantized) -{ - // TODO - // Implement GetDequantizedOutput Function. - // Create Test for Uint8 Case -} +// TODO Uint8Quantized +// Implement GetDequantizedOutput Function. 
+// Create Test for Uint8 Case } // namespace } // namespace kernels diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp index b0c06e7..c79d3d6 100644 --- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp +++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp @@ -61,15 +61,14 @@ TEST(LeakReluTest, FloatSimple) 1.0f, -0.5f, -1.0f, // Row 2 }, /*alpha=*/0.5f, getElementType()); -} -TEST(LeakReluTest, Uint8Simple) -{ - // TODO - // Implement GetDequantizedOutput Function. - // Create Test for Uint8 Case + SUCCEED(); } +// TODO Uint8Simple +// Implement GetDequantizedOutput Function. +// Create Test for Uint8 Case + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp index 17456a4..00feddf 100644 --- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp @@ -49,10 +49,8 @@ TEST(LogisticTest, Float) // TODO make a Shape checking of output_tensor. } -TEST(LogisticTest, Uint8) -{ - // Need to Implement GetDequantizedOutput Function. -} +// TODO Uint8 +// Need to Implement GetDequantizedOutput Function. } // namespace } // namespace kernels diff --git a/compiler/luci-interpreter/src/kernels/Reverse.cpp b/compiler/luci-interpreter/src/kernels/Reverse.cpp new file mode 100644 index 0000000..a463084 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Reverse.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/Reverse.h" +#include "kernels/Utils.h" +#include + +namespace luci_interpreter +{ + +namespace kernels +{ + +Reverse::Reverse(const Tensor *input, const Tensor *axes, Tensor *output) + : Kernel({input, axes}, {output}) +{ +} + +void Reverse::configure() +{ + assert(axes()->shape().num_dims() == 1); + assert(input()->shape().num_dims() >= axes()->shape().num_elements()); + if (input()->element_type() != DataType::S32 && input()->element_type() != DataType::FLOAT32 && + input()->element_type() != DataType::U8 && input()->element_type() != DataType::S16 && + input()->element_type() != DataType::S64) + { + throw std::runtime_error("Unsupported input type."); + } + if (axes()->element_type() != DataType::S32) + { + throw std::runtime_error("Unsupported axes type."); + } + if (axes()->shape().num_elements() > 1) + { + throw std::runtime_error("Current implementation does not support more than 1 axis."); + } + int axis_value = getTensorData(axes())[0]; + if (axis_value < 0 || axis_value >= input()->shape().num_dims()) + { + throw std::runtime_error("Invalid axes value"); + } + assert(input()->element_type() == output()->element_type()); + + output()->resize(input()->shape()); +} + +void Reverse::execute() const +{ + int axis_value = getTensorData(axes())[0]; + switch (output()->element_type()) + { + case DataType::FLOAT32: + tflite::reference_ops::Reverse(axis_value, getTensorShape(input()), + getTensorData(input()), getTensorShape(output()), + getTensorData(output())); + break; + case DataType::U8: + tflite::reference_ops::Reverse( + axis_value, getTensorShape(input()), getTensorData(input()), + getTensorShape(output()), getTensorData(output())); + break; + default: + throw std::runtime_error("Unsupported output type"); + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Reverse.h b/compiler/luci-interpreter/src/kernels/Reverse.h new file mode 100644 index 0000000..3489dae --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Reverse.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_KERNELS_REVERSE_H +#define LUCI_INTERPRETER_KERNELS_REVERSE_H + +#include "core/Kernel.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class Reverse : public Kernel +{ +public: + Reverse(const Tensor *input, const Tensor *axes, Tensor *output); + + const Tensor *input() const { return _inputs[0]; } + const Tensor *axes() const { return _inputs[1]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_REVERSE_H diff --git a/compiler/luci-interpreter/src/kernels/Reverse.test.cpp b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp new file mode 100644 index 0000000..5475a8b --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/Reverse.h" +#include "kernels/TestUtils.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +template class ReverseTest : public ::testing::Test +{ +}; + +using DataTypes = ::testing::Types; +TYPED_TEST_CASE(ReverseTest, DataTypes); + +TYPED_TEST(ReverseTest, MultiDimensions) +{ + // TypeParam + std::vector input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; + Shape input_shape{4, 3, 2}; + std::vector axis_data{1}; + Shape axis_shape{1}; + + std::vector output_data{5, 6, 3, 4, 1, 2, 11, 12, 9, 10, 7, 8, + 17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}; + std::vector output_shape{4, 3, 2}; + + Tensor input_tensor = makeInputTensor()>(input_shape, input_data); + Tensor axis_tensor = makeInputTensor(axis_shape, axis_data); + + Tensor output_tensor = makeOutputTensor(getElementType()); + + Reverse kernel = Reverse(&input_tensor, &axis_tensor, &output_tensor); + kernel.configure(); + kernel.execute(); + + EXPECT_THAT(extractTensorData(output_tensor), + ::testing::ElementsAreArray(output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-interpreter/src/kernels/Slice.cpp new file mode 100644 index 0000000..c4bc3c5 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Slice.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/Slice.h" +#include "Utils.h" +#include + +#include +#include + +namespace luci_interpreter +{ + +namespace kernels +{ +const int max_dim = 4; + +Slice::Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output) + : Kernel({input, begin, size}, {output}) +{ +} + +template +Shape calculateOutputShape(const Tensor *input, const Tensor *begin, const Tensor *size) +{ + Shape output_shape = Shape(input->shape().num_dims()); + for (int idx = 0; idx < input->shape().num_dims(); idx++) + { + T size_value = getTensorData(size)[idx]; + if (size_value < 0) + { + if (size_value != -1) + { + throw std::runtime_error("Invalid size."); + } + size_value = input->shape().dim(idx) - getTensorData(begin)[idx]; + } + else + { + if (input->shape().dim(idx) < getTensorData(begin)[idx] + size_value) + { + throw std::runtime_error("Invalid begin and size."); + } + } + output_shape.dim(idx) = static_cast(size_value); + } + return output_shape; +} + +template +void getBeginAndSizeVectors(int dimensions, const Tensor *begin, const Tensor *size, + std::vector *begins, std::vector *sizes) +{ + for (int idx = dimensions - 1; idx >= 0; --idx) + { + begins->push_back(getTensorData(begin)[idx]); + sizes->push_back(getTensorData(size)[idx]); + } +} + +void Slice::configure() +{ + assert(input()->element_type() == output()->element_type()); + assert(begin()->element_type() == DataType::S32 || begin()->element_type() == DataType::S64); + assert(size()->element_type() == DataType::S32 || size()->element_type() == DataType::S64); + assert(begin()->shape().num_dims() == 1); + assert(size()->shape().num_dims() == 1); + assert(input()->shape().num_dims() <= max_dim); + + if (begin()->element_type() == DataType::S32) + { + output()->resize(calculateOutputShape(input(), begin(), size())); + } + else if (begin()->element_type() == DataType::S64) + { + output()->resize(calculateOutputShape(input(), begin(), size())); + } + else + { + throw std::runtime_error("Unsupported type."); + } +} + +void Slice::execute() const +{ + std::vector begins; + begins.reserve(max_dim); + std::vector sizes; + sizes.reserve(max_dim); + if (begin()->element_type() == DataType::S32) + { + getBeginAndSizeVectors(input()->shape().num_dims(), begin(), size(), &begins, &sizes); + } + else if (begin()->element_type() == DataType::S64) + { + getBeginAndSizeVectors(input()->shape().num_dims(), begin(), size(), &begins, &sizes); + } + else + { + throw std::runtime_error("Unsupported begin type."); + } + for (int i = input()->shape().num_dims(); i < max_dim; ++i) + { + begins.push_back(0); + sizes.push_back(1); + } + + assert(begins.size() == 4); + assert(sizes.size() == 4); + tflite::SliceParams op_params{}; + op_params.begin_count = 4; + op_params.size_count = 4; + for (int i = 0; i < 4; i++) + { + op_params.begin[i] = begins[3 - i]; + op_params.size[i] = sizes[3 - i]; + } + switch (input()->element_type()) + { + case DataType::FLOAT32: + tflite::optimized_ops::Slice(op_params, getTensorShape(input()), + getTensorData(input()), getTensorShape(output()), + getTensorData(output())); + break; + case 
DataType::U8: + tflite::optimized_ops::Slice(op_params, getTensorShape(input()), + getTensorData(input()), getTensorShape(output()), + getTensorData(output())); + break; + default: + throw std::runtime_error("Unsupported input type."); + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Slice.h b/compiler/luci-interpreter/src/kernels/Slice.h new file mode 100644 index 0000000..23c3596 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Slice.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_KERNELS_SLICE_H +#define LUCI_INTERPRETER_KERNELS_SLICE_H + +#include "core/Kernel.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class Slice : public Kernel +{ +public: + Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output); + + const Tensor *input() const { return _inputs[0]; } + const Tensor *begin() const { return _inputs[1]; } + const Tensor *size() const { return _inputs[2]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_SLICE_H diff --git a/compiler/luci-interpreter/src/kernels/Slice.test.cpp b/compiler/luci-interpreter/src/kernels/Slice.test.cpp new file mode 100644 index 0000000..a360a29 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Slice.test.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/Slice.h" +#include "kernels/TestUtils.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +template class SliceTest : public ::testing::Test +{ +}; + +using DataTypes = ::testing::Types; +TYPED_TEST_CASE(SliceTest, DataTypes); + +TYPED_TEST(SliceTest, SimpleTest) +{ + std::vector input_data{1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}; + Shape input_shape{3, 2, 3, 1}; + std::vector begin_data{1, 0, 0, 0}; + Shape begin_shape{4}; + std::vector size_data{2, 1, -1, 1}; + Shape size_shape{4}; + std::vector output_data{3, 3, 3, 5, 5, 5}; + std::vector output_shape{2, 1, 3, 1}; + + Tensor input_tensor = makeInputTensor()>(input_shape, input_data); + Tensor begin_tensor = makeInputTensor(begin_shape, begin_data); + Tensor size_tensor = makeInputTensor(size_shape, size_data); + + Tensor output_tensor = makeOutputTensor(getElementType()); + + Slice kernel(&input_tensor, &begin_tensor, &size_tensor, &output_tensor); + kernel.configure(); + kernel.execute(); + + EXPECT_THAT(extractTensorData(output_tensor), + ::testing::ElementsAreArray(output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp index 3386d36..b8c0ac4 100644 --- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp +++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp @@ -68,6 +68,8 @@ TEST(TransposeConvTest, FloatSimple) /*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365}, /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1, getElementType()); + + SUCCEED(); } TEST(TransposeConvTest, FloatTwoFiltersTest) @@ -82,21 +84,18 @@ TEST(TransposeConvTest, FloatTwoFiltersTest) 3352, 3652, 2760}, /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1, getElementType()); -} -TEST(TransposeConvTest, Uint8Simple) -{ - // TODO - // Implement GetDequantizedOutput Function. - // Create Test for Uint8 Case -} -TEST(TransposeConvTest, Uint8FiltersTest) -{ - // TODO - // Implement GetDequantizedOutput Function. - // Create Test for Uint8 Case + SUCCEED(); } +// TODO Uint8Simple +// Implement GetDequantizedOutput Function. +// Create Test for Uint8 Case + +// TODO Uint8FiltersTest +// Implement GetDequantizedOutput Function. 
+// Create Test for Uint8 Case + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-interpreter/src/loader/CMakeLists.txt index fb36c4a..d99485d 100644 --- a/compiler/luci-interpreter/src/loader/CMakeLists.txt +++ b/compiler/luci-interpreter/src/loader/CMakeLists.txt @@ -1,3 +1,5 @@ +nnas_find_package(GTest REQUIRED) + set(SOURCES GraphLoader.h GraphLoader.cpp @@ -13,3 +15,8 @@ target_include_directories(luci_interpreter_loader PUBLIC "${LUCI_INTERPRETER_SO target_link_libraries(luci_interpreter_loader PUBLIC luci_lang luci_interpreter_core PRIVATE luci_interpreter_kernels nncc_common) + +set(TEST_SOURCES KernelBuilder.test.cpp) + +GTest_AddTest(luci_interpreter_loader_test ${TEST_SOURCES}) +target_link_libraries(luci_interpreter_loader_test luci_interpreter_loader) diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp index 779fa06..6ebf979 100644 --- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp +++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp @@ -16,7 +16,6 @@ #include "loader/GraphLoader.h" -#include "loader/ModuleLoader.h" #include "loader/KernelBuilder.h" #include @@ -71,6 +70,7 @@ bool isExecutableNode(const luci::CircleNode *node) { // These nodes denote inputs / outputs of a graph. case luci::CircleOpcode::CONST: + case luci::CircleOpcode::CIRCLECONST: case luci::CircleOpcode::CIRCLEINPUT: case luci::CircleOpcode::CIRCLEOUTPUT: // The following nodes denote outputs of multiple-output nodes. @@ -102,11 +102,12 @@ bool isTensorProducingNode(const luci::CircleNode *node) } // namespace -GraphLoader::GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph, - RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, - std::unordered_map &node_to_tensor) - : _module_loader(module_loader), _graph(graph), _runtime_graph(runtime_graph), - _runtime_to_ir(runtime_to_ir), _node_to_tensor(node_to_tensor) +GraphLoader::GraphLoader( + const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, + const std::unordered_map &graph_to_runtime_graph, + std::unordered_map &node_to_tensor) + : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir), + _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor) { } @@ -136,6 +137,7 @@ void GraphLoader::loadTensors() const luci::CircleQuantParam *params = node->quantparam(); quantization.scale.assign(params->scale.cbegin(), params->scale.cend()); quantization.zero_point.assign(params->zerop.cbegin(), params->zerop.cend()); + quantization.quantized_dimension = params->quantized_dimension; } auto tensor = std::make_unique(node->dtype(), std::move(shape), std::move(quantization), @@ -178,7 +180,7 @@ void GraphLoader::initInputOutputTensors() const void GraphLoader::loadOperators() { - KernelBuilder kernel_builder(_module_loader, *this); + KernelBuilder kernel_builder(_graph_to_runtime_graph, _node_to_tensor); // Create kernels for executable nodes. This has to be done in execution order. 
for (const loco::Node *loco_node : @@ -195,11 +197,4 @@ void GraphLoader::loadOperators() } } -void GraphLoader::load() -{ - loadTensors(); - initInputOutputTensors(); - loadOperators(); -} - } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.h b/compiler/luci-interpreter/src/loader/GraphLoader.h index e0adc0f..89c5bca 100644 --- a/compiler/luci-interpreter/src/loader/GraphLoader.h +++ b/compiler/luci-interpreter/src/loader/GraphLoader.h @@ -27,29 +27,23 @@ namespace luci_interpreter { -class ModuleLoader; - class GraphLoader { public: - GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph, - RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, + GraphLoader(const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, + const std::unordered_map &graph_to_runtime_graph, std::unordered_map &node_to_tensor); - void load(); - - Tensor *getTensorForNode(const loco::Node *node) const { return _node_to_tensor.at(node); } - -private: - void loadOperators(); - void initInputOutputTensors() const; void loadTensors(); + void initInputOutputTensors() const; + void loadOperators(); - const ModuleLoader &_module_loader; +private: const loco::Graph *_graph; RuntimeGraph *_runtime_graph; RuntimeToIR &_runtime_to_ir; + const std::unordered_map &_graph_to_runtime_graph; std::unordered_map &_node_to_tensor; }; diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp index 56da961..c19f897 100644 --- a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp +++ b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp @@ -21,6 +21,7 @@ #include "kernels/AveragePool2D.h" #include "kernels/Concatenation.h" #include "kernels/Conv2D.h" +#include "kernels/DepthToSpace.h" #include "kernels/DepthwiseConv2D.h" #include "kernels/Elu.h" #include "kernels/FullyConnected.h" @@ -35,6 +36,8 @@ #include "kernels/Mul.h" #include "kernels/Pad.h" #include "kernels/Reshape.h" +#include "kernels/Reverse.h" +#include "kernels/Slice.h" #include "kernels/Softmax.h" #include "kernels/SpaceToDepth.h" #include "kernels/Split.h" @@ -43,8 +46,6 @@ #include "kernels/Unpack.h" #include "kernels/Transpose.h" #include "kernels/TransposeConv.h" -#include "loader/GraphLoader.h" -#include "loader/ModuleLoader.h" #include @@ -68,7 +69,7 @@ static std::vector collectOutputNodes(const luci::CircleNode const Tensor *KernelBuilder::getInputTensor(const loco::Node *node) const { - const Tensor *tensor = _graph_loader.getTensorForNode(node); + const Tensor *tensor = _node_to_tensor.at(node); assert(tensor != nullptr); return tensor; } @@ -81,7 +82,7 @@ const Tensor *KernelBuilder::getOptionalInputTensor(const loco::Node *node) cons Tensor *KernelBuilder::getOutputTensor(const loco::Node *node) const { - Tensor *tensor = _graph_loader.getTensorForNode(node); + Tensor *tensor = _node_to_tensor.at(node); assert(tensor != nullptr); return tensor; } @@ -98,7 +99,7 @@ KernelBuilder::getOutputTensors(const std::vector &nodes) co RuntimeGraph *KernelBuilder::getRuntimeGraph(const loco::Graph *graph) const { - RuntimeGraph *runtime_graph = _module_loader.getRuntimeGraph(graph); + RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph); assert(runtime_graph != nullptr); return runtime_graph; } @@ -120,14 +121,14 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleAdd *node) std::unique_ptr KernelBuilder::visit(const luci::CircleArgMax *node) { assert(node->arity() == 2); - const Tensor *input1 = 
getInputTensor(node->input()); - const Tensor *input2 = getInputTensor(node->dimension()); + const Tensor *input = getInputTensor(node->input()); + const Tensor *axis = getInputTensor(node->dimension()); Tensor *output = getOutputTensor(node); ArgMaxParams params{}; params.output_type = node->output_type(); - return std::make_unique(input1, input2, output, params); + return std::make_unique(input, axis, output, params); } std::unique_ptr KernelBuilder::visit(const luci::CircleAveragePool2D *node) @@ -188,6 +189,19 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleConv2D *node) return std::make_unique(input, filter, bias, output, params); } +std::unique_ptr KernelBuilder::visit(const luci::CircleDepthToSpace *node) +{ + assert(node->arity() == 1); + + const Tensor *input = getInputTensor(node->input()); + Tensor *output = getOutputTensor(node); + + DepthToSpaceParams params{}; + params.block_size = node->block_size(); + + return std::make_unique(input, output, params); +} + std::unique_ptr KernelBuilder::visit(const luci::CircleDepthwiseConv2D *node) { assert(node->arity() == 3); @@ -224,14 +238,14 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleFullyConnected *n assert(node->arity() == 3); const Tensor *input = getInputTensor(node->input()); - const Tensor *filter = getInputTensor(node->weights()); + const Tensor *weights = getInputTensor(node->weights()); const Tensor *bias = getOptionalInputTensor(node->bias()); Tensor *output = getOutputTensor(node); FullyConnectedParams params{}; params.activation = node->fusedActivationFunction(); - return std::make_unique(input, filter, bias, output, params); + return std::make_unique(input, weights, bias, output, params); } std::unique_ptr KernelBuilder::visit(const luci::CircleIf *node) @@ -255,6 +269,11 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleIf *node) else_graph); } +std::unique_ptr KernelBuilder::visit(const luci::CircleInput *) +{ + throw std::runtime_error("Input node cannot be executed."); +} + std::unique_ptr KernelBuilder::visit(const luci::CircleL2Normalize *node) { assert(node->arity() == 1); @@ -323,11 +342,6 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleLogistic *node) return std::make_unique(input, output); } -std::unique_ptr KernelBuilder::visit(const luci::CircleInput *) -{ - throw std::runtime_error("Input node cannot be executed."); -} - std::unique_ptr KernelBuilder::visit(const luci::CircleMaxPool2D *node) { assert(node->arity() == 1); @@ -402,6 +416,30 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleReshape *node) return std::make_unique(input, shape, output); } +std::unique_ptr KernelBuilder::visit(const luci::CircleReverseV2 *node) +{ + assert(node->arity() == 2); + + const Tensor *input = getInputTensor(node->tensor()); + const Tensor *axes = getInputTensor(node->axis()); + Tensor *output = getOutputTensor(node); + + return std::make_unique(input, axes, output); +} + +std::unique_ptr KernelBuilder::visit(const luci::CircleSlice *node) +{ + assert(node->arity() == 3); + + const Tensor *input = getInputTensor(node->input()); + const Tensor *begin = getInputTensor(node->begin()); + const Tensor *size = getInputTensor(node->size()); + + Tensor *output = getOutputTensor(node); + + return std::make_unique(input, begin, size, output); +} + std::unique_ptr KernelBuilder::visit(const luci::CircleSoftmax *node) { assert(node->arity() == 1); @@ -442,6 +480,19 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleSplit *node) return std::make_unique(axis, input, 
std::move(outputs)); } +std::unique_ptr KernelBuilder::visit(const luci::CircleSqueeze *node) +{ + assert(node->arity() == 1); + + const Tensor *input = getInputTensor(node->input()); + Tensor *output = getOutputTensor(node); + + SqueezeParams params{}; + params.squeeze_dims = node->squeeze_dims(); + + return std::make_unique(input, output, params); +} + std::unique_ptr KernelBuilder::visit(const luci::CircleStridedSlice *node) { assert(node->arity() == 4); @@ -463,21 +514,15 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleStridedSlice *nod return std::make_unique(input, begin, end, strides, output, params); } -std::unique_ptr KernelBuilder::visit(const luci::CircleSqueeze *node) +std::unique_ptr KernelBuilder::visit(const luci::CircleTranspose *node) { - assert(node->arity() == 1); + assert(node->arity() == 2); - const Tensor *input = getInputTensor(node->input()); + const Tensor *input = getInputTensor(node->a()); + const Tensor *perm = getInputTensor(node->perm()); Tensor *output = getOutputTensor(node); - SqueezeParams params{}; - assert(node->squeeze_dims().size() <= 4); - for (size_t i = 0; i < node->squeeze_dims().size(); i++) - { - params.squeeze_dims.push_back(node->squeeze_dims().at(i)); - } - - return std::make_unique(input, output, params); + return std::make_unique(input, perm, output); } std::unique_ptr KernelBuilder::visit(const luci::CircleTransposeConv *node) @@ -515,15 +560,4 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleUnpack *node) return std::make_unique(input, std::move(outputs), params); } -std::unique_ptr KernelBuilder::visit(const luci::CircleTranspose *node) -{ - assert(node->arity() == 2); - - const Tensor *input = getInputTensor(node->a()); - const Tensor *perm = getInputTensor(node->perm()); - Tensor *output = getOutputTensor(node); - - return std::make_unique(input, perm, output); -} - } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-interpreter/src/loader/KernelBuilder.h index 7e30d39..d5c5a4b 100644 --- a/compiler/luci-interpreter/src/loader/KernelBuilder.h +++ b/compiler/luci-interpreter/src/loader/KernelBuilder.h @@ -24,18 +24,18 @@ #include #include +#include namespace luci_interpreter { -class GraphLoader; -class ModuleLoader; - class KernelBuilder : public luci::CircleNodeVisitor> { public: - KernelBuilder(const ModuleLoader &module_loader, const GraphLoader &graph_loader) - : _module_loader(module_loader), _graph_loader(graph_loader) + KernelBuilder( + const std::unordered_map &graph_to_runtime_graph, + const std::unordered_map &node_to_tensor) + : _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor) { } @@ -45,6 +45,7 @@ public: std::unique_ptr visit(const luci::CircleConcatenation *node) override; std::unique_ptr visit(const luci::CircleConv2D *node) override; std::unique_ptr visit(const luci::CircleConst *node) override; + std::unique_ptr visit(const luci::CircleDepthToSpace *node) override; std::unique_ptr visit(const luci::CircleDepthwiseConv2D *node) override; std::unique_ptr visit(const luci::CircleElu *node) override; std::unique_ptr visit(const luci::CircleFullyConnected *node) override; @@ -61,6 +62,8 @@ public: std::unique_ptr visit(const luci::CircleOutput *node) override; std::unique_ptr visit(const luci::CirclePad *node) override; std::unique_ptr visit(const luci::CircleReshape *node) override; + std::unique_ptr visit(const luci::CircleReverseV2 *node) override; + std::unique_ptr visit(const luci::CircleSlice *node) 
override; std::unique_ptr visit(const luci::CircleSoftmax *node) override; std::unique_ptr visit(const luci::CircleSpaceToDepth *node) override; std::unique_ptr visit(const luci::CircleSplit *node) override; @@ -82,8 +85,8 @@ private: RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const; private: - const ModuleLoader &_module_loader; - const GraphLoader &_graph_loader; + const std::unordered_map &_graph_to_runtime_graph; + const std::unordered_map &_node_to_tensor; }; } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp new file mode 100644 index 0000000..33bc8ec --- /dev/null +++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp @@ -0,0 +1,743 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "loader/GraphLoader.h" +#include "loader/KernelBuilder.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace luci_interpreter +{ +namespace +{ + +using namespace testing; + +class KernelBuilderTest : public Test +{ +protected: + luci::CircleInput *createInputNode() { return createNode(); } + + template NodeT *createNode(Args &&... args) + { + auto *node = _graph.nodes()->create(std::forward(args)...); + // The actual type does not matter for the purpose of the tests. + // NOTE The type is meaningless for nodes with multiple outputs (corresponding *Out nodes carry + // actual output types). 
+ node->dtype(loco::DataType::FLOAT32); + return node; + } + + template NodeOutT *createNodeOut(loco::Node *node, int index) + { + auto *node_out = createNode(); + node_out->input(node); + node_out->index(index); + return node_out; + } + + template std::unique_ptr buildKernel(const luci::CircleNode *op) + { + std::unordered_map graph_to_runtime_graph; + + RuntimeGraph runtime_graph(nullptr); + RuntimeToIR runtime_to_ir; + GraphLoader graph_loader(&_graph, &runtime_graph, runtime_to_ir, graph_to_runtime_graph, + _node_to_tensor); + graph_loader.loadTensors(); + + KernelBuilder kernel_builder(graph_to_runtime_graph, _node_to_tensor); + + auto kernel = op->accept(&kernel_builder); + return std::unique_ptr(dynamic_cast(kernel.release())); + } + + void checkTensor(const Tensor *tensor, const loco::Node *node) + { + EXPECT_THAT(tensor, Eq(_node_to_tensor.at(node))); + } + +private: + loco::Graph _graph; + std::unordered_map _node_to_tensor; +}; + +TEST_F(KernelBuilderTest, Add) +{ + auto *input1 = createInputNode(); + auto *input2 = createInputNode(); + + auto *op = createNode(); + op->x(input1); + op->y(input2); + + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input1(), input1); + checkTensor(kernel->input2(), input2); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, ArgMax) +{ + auto *input = createInputNode(); + auto *axis = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->dimension(axis); + + op->output_type(loco::DataType::FLOAT32); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->axis(), axis); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().output_type, Eq(op->output_type())); +} + +TEST_F(KernelBuilderTest, AveragePool2D) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->value(input); + + op->padding(luci::Padding::SAME); + op->filter()->h(11); + op->filter()->w(13); + op->stride()->h(17); + op->stride()->w(19); + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); + EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, Concatenation) +{ + auto *input1 = createInputNode(); + auto *input2 = createInputNode(); + + auto *op = createNode(2); + op->values(0, input1); + op->values(1, input2); + op->axis(11); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(0), input1); + checkTensor(kernel->input(1), input2); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().axis, Eq(op->axis())); +} + +TEST_F(KernelBuilderTest, Conv2D) +{ + auto *input = createInputNode(); + auto *filter = createInputNode(); + auto *bias = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->filter(filter); + op->bias(bias); + + 
op->padding(luci::Padding::SAME); + op->stride()->h(11); + op->stride()->w(13); + op->dilation()->h(17); + op->dilation()->w(19); + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->filter(), filter); + checkTensor(kernel->bias(), bias); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); + EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h())); + EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w())); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, DepthToSpace) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->input(input); + + op->block_size(11); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().block_size, Eq(op->block_size())); +} + +TEST_F(KernelBuilderTest, DepthwiseConv2D) +{ + auto *input = createInputNode(); + auto *filter = createInputNode(); + auto *bias = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->filter(filter); + op->bias(bias); + + op->padding(luci::Padding::SAME); + op->depthMultiplier(11); + op->stride()->h(13); + op->stride()->w(17); + op->dilation()->h(19); + op->dilation()->w(23); + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->filter(), filter); + checkTensor(kernel->bias(), bias); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().depth_multiplier, Eq(op->depthMultiplier())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); + EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h())); + EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w())); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, Elu) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->features(input); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, FullyConnected) +{ + auto *input = createInputNode(); + auto *weights = createInputNode(); + auto *bias = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->weights(weights); + op->bias(bias); + + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->weights(), weights); + checkTensor(kernel->bias(), bias); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, L2Normalize) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->x(input); + + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + 
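+  // buildKernel() returns nullptr when the visitor produces a kernel of a different concrete
+  // type, so the NotNull() assertion below doubles as a type check.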
ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, L2Pool2D) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->value(input); + + op->padding(luci::Padding::SAME); + op->filter()->h(11); + op->filter()->w(13); + op->stride()->h(17); + op->stride()->w(19); + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); + EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, LeakyRelu) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->features(input); + + op->alpha(11.0f); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().alpha, Eq(op->alpha())); +} + +TEST_F(KernelBuilderTest, LocalResponseNormalization) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->input(input); + + op->radius(11); + op->bias(13.0f); + op->alpha(15.0f); + op->beta(17.0f); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().radius, Eq(op->radius())); + EXPECT_THAT(kernel->params().bias, Eq(op->bias())); + EXPECT_THAT(kernel->params().alpha, Eq(op->alpha())); + EXPECT_THAT(kernel->params().beta, Eq(op->beta())); +} + +TEST_F(KernelBuilderTest, Logistic) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->x(input); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, MaxPool2D) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->value(input); + + op->padding(luci::Padding::SAME); + op->filter()->h(11); + op->filter()->w(13); + op->stride()->h(17); + op->stride()->w(19); + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); + EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, Mean) +{ + auto *input = createInputNode(); + auto *axes = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->reduction_indices(axes); + + op->keep_dims(true); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->axes(), axes); + 
checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().keep_dims, Eq(op->keep_dims())); +} + +TEST_F(KernelBuilderTest, Mul) +{ + auto *input1 = createInputNode(); + auto *input2 = createInputNode(); + + auto *op = createNode(); + op->x(input1); + op->y(input2); + + op->fusedActivationFunction(luci::FusedActFunc::RELU); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input1(), input1); + checkTensor(kernel->input2(), input2); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); +} + +TEST_F(KernelBuilderTest, Pad) +{ + auto *input = createInputNode(); + auto *paddings = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->paddings(paddings); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->paddings(), paddings); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, Reshape) +{ + auto *input = createInputNode(); + auto *shape = createInputNode(); + + auto *op = createNode(); + op->tensor(input); + op->shape(shape); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->shape(), shape); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, ReverseV2) +{ + auto *input = createInputNode(); + auto *axes = createInputNode(); + + auto *op = createNode(); + op->tensor(input); + op->axis(axes); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->axes(), axes); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, Slice) +{ + auto *input = createInputNode(); + auto *begin = createInputNode(); + auto *size = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->begin(begin); + op->size(size); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->begin(), begin); + checkTensor(kernel->size(), size); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, Softmax) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->logits(input); + + op->beta(11.0f); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().beta, Eq(op->beta())); +} + +TEST_F(KernelBuilderTest, SpaceToDepth) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->input(input); + + op->block_size(11); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().block_size, op->block_size()); +} + +TEST_F(KernelBuilderTest, Split) +{ + auto *axis = createInputNode(); + auto *input = createInputNode(); + auto *op = createNode(); + auto *output1 = createNodeOut(op, 0); + auto *output2 = createNodeOut(op, 1); + + op->split_dim(axis); + op->input(input); + + op->num_split(2); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->axis(), axis); + checkTensor(kernel->input(), input); + checkTensor(kernel->output(0), output1); + checkTensor(kernel->output(1), output2); +} + +TEST_F(KernelBuilderTest, Squeeze) +{ + auto *input = createInputNode(); + + auto *op = createNode(); + op->input(input); + + 
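+  // The squeeze dimensions set below are forwarded into the kernel's params and compared
+  // element-wise by the ElementsAreArray check at the end of this test.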
op->squeeze_dims({11, 13}); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().squeeze_dims, ElementsAreArray(op->squeeze_dims())); +} + +TEST_F(KernelBuilderTest, StridedSlice) +{ + auto *input = createInputNode(); + auto *begin = createInputNode(); + auto *end = createInputNode(); + auto *strides = createInputNode(); + + auto *op = createNode(); + op->input(input); + op->begin(begin); + op->end(end); + op->strides(strides); + + op->begin_mask(11); + op->ellipsis_mask(13); + op->end_mask(17); + op->new_axis_mask(19); + op->shrink_axis_mask(23); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->begin(), begin); + checkTensor(kernel->end(), end); + checkTensor(kernel->strides(), strides); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().begin_mask, Eq(op->begin_mask())); + EXPECT_THAT(kernel->params().ellipsis_mask, Eq(op->ellipsis_mask())); + EXPECT_THAT(kernel->params().end_mask, Eq(op->end_mask())); + EXPECT_THAT(kernel->params().new_axis_mask, Eq(op->new_axis_mask())); + EXPECT_THAT(kernel->params().shrink_axis_mask, Eq(op->shrink_axis_mask())); +} + +TEST_F(KernelBuilderTest, Transpose) +{ + auto *input = createInputNode(); + auto *perm = createInputNode(); + + auto *op = createNode(); + op->a(input); + op->perm(perm); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->perm(), perm); + checkTensor(kernel->output(), op); +} + +TEST_F(KernelBuilderTest, TransposeConv) +{ + auto *output_shape = createInputNode(); + auto *filter = createInputNode(); + auto *input = createInputNode(); + + auto *op = createNode(); + op->inputSizes(output_shape); + op->filter(filter); + op->outBackprop(input); + + op->padding(luci::Padding::SAME); + op->stride()->h(11); + op->stride()->w(13); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->output_shape(), output_shape); + checkTensor(kernel->filter(), filter); + checkTensor(kernel->input(), input); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().padding, Eq(op->padding())); + EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); + EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); +} + +TEST_F(KernelBuilderTest, Unpack) +{ + auto *input = createInputNode(); + auto *op = createNode(); + auto *output1 = createNodeOut(op, 0); + auto *output2 = createNodeOut(op, 1); + + op->value(input); + + op->num(2); + op->axis(11); + + auto kernel = buildKernel(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->input(), input); + checkTensor(kernel->output(0), output1); + checkTensor(kernel->output(1), output2); + EXPECT_THAT(kernel->params().axis, Eq(op->axis())); +} + +TEST_F(KernelBuilderTest, NonExisting1_NEG) +{ + auto *op = createNode(); + ASSERT_ANY_THROW(buildKernel(op)); +} + +TEST_F(KernelBuilderTest, NonExisting2_NEG) +{ + auto *op = createNode(); + ASSERT_ANY_THROW(buildKernel(op)); +} + +TEST_F(KernelBuilderTest, NonExisting3_NEG) +{ + auto *op = createNode(); + ASSERT_ANY_THROW(buildKernel(op)); +} + +} // namespace +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp index 7780a61..b9a2ae0 100644 --- 
a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp +++ b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp @@ -41,8 +41,11 @@ void ModuleLoader::load() { const loco::Graph *graph = _module->graph(i); RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph); - GraphLoader loader(*this, graph, runtime_graph, _runtime_to_ir, _node_to_tensor); - loader.load(); + GraphLoader loader(graph, runtime_graph, _runtime_to_ir, _graph_to_runtime_graph, + _node_to_tensor); + loader.loadTensors(); + loader.initInputOutputTensors(); + loader.loadOperators(); } } diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.h b/compiler/luci-interpreter/src/loader/ModuleLoader.h index 954dbfb..1af0ed7 100644 --- a/compiler/luci-interpreter/src/loader/ModuleLoader.h +++ b/compiler/luci-interpreter/src/loader/ModuleLoader.h @@ -36,11 +36,6 @@ public: void load(); - RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const - { - return _graph_to_runtime_graph.at(graph); - } - private: const luci::Module *_module; RuntimeModule *_runtime_module; diff --git a/compiler/luci-value-test/evalverify.sh b/compiler/luci-value-test/evalverify.sh index dfd55a6..12c9a45 100755 --- a/compiler/luci-value-test/evalverify.sh +++ b/compiler/luci-value-test/evalverify.sh @@ -4,8 +4,10 @@ # # HOW TO USE # -# ./evalverify.sh ... -# work_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test) +# ./evalverify.sh ... +# bin_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test) +# work_dir : artifacts directoy where test materials exist +# venv_dir : python virtual environment home directory VERIFY_SOURCE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" VERIFY_SCRIPT_PATH="${VERIFY_SOURCE_PATH}/luci_eval_verifier.py" diff --git a/compiler/luci-value-test/test.lst b/compiler/luci-value-test/test.lst index 6a332f9..364d881 100644 --- a/compiler/luci-value-test/test.lst +++ b/compiler/luci-value-test/test.lst @@ -1,6 +1,8 @@ #addeval(Abs_000) addeval(Add_000) +#addeval(Add_001) addeval(Add_U8_000) +#addeval(AddN_000) #addeval(ArgMax_000) #addeval(ArgMax_001) #addeval(ArgMax_002) @@ -9,73 +11,173 @@ addeval(Add_U8_000) #addeval(ArgMax_U8_001) #addeval(ArgMax_U8_002) #addeval(ArgMax_U8_003) +#addeval(ArgMin_000) +#addeval(ArgMin_001) +#addeval(ArgMin_002) +#addeval(ArgMin_003) +#addeval(ArgMin_U8_000) +#addeval(ArgMin_U8_001) +#addeval(ArgMin_U8_002) +#addeval(ArgMin_U8_003) addeval(AveragePool2D_000) +#addeval(BatchMatMul_000) #addeval(BatchMatMulV2_000) #addeval(BatchMatMulV2_001) #addeval(BatchToSpaceND_000) #addeval(Cast_000) +#addeval(Cast_001) +#addeval(Ceil_000) addeval(Concatenation_000) addeval(Concatenation_U8_000) addeval(Conv2D_000) addeval(Conv2D_001) addeval(Conv2D_002) +#addeval(Conv2D_003) addeval(Conv2D_U8_000) addeval(Conv2D_U8_001) #addeval(Cos_000) +#addeval(DepthToSpace_000) addeval(DepthwiseConv2D_000) addeval(DepthwiseConv2D_U8_000) +#addeval(DepthwiseConv2D_U8_001) +addeval(DepthwiseConv2D_001) #addeval(Div_000) +#addeval(ELU_000) #addeval(Equal_000) #addeval(Exp_000) +#addeval(ExpandDims_000) +#addeval(ExpandDims_001) +#addeval(ExpandDims_002) +#addeval(ExpandDims_003) +#addeval(Fill_000) +#addeval(Fill_001) +#addeval(Floor_000) +#addeval(FloorDiv_000) +#addeval(FloorDiv_001) +#addeval(FloorMod_000) +#addeval(FloorMod_001) addeval(FullyConnected_000) addeval(FullyConnected_001) #addeval(FullyConnected_002) #addeval(FullyConnected_U8_000) #addeval(Gather_000) +#addeval(GatherNd_000) +#addeval(Greater_000) +#addeval(GreaterEqual_000) 
#addeval(If_000) #addeval(If_001) +addeval(L2Normalize_000) +addeval(L2Pool2D_000) +#addeval(L2Pool2D_U8_000) +#addeval(LeakyRelu_000) +#addeval(Less_000) +#addeval(LessEqual_000) +#addeval(LocalResponseNormalization_000) +#addeval(Log_000) +#addeval(LogicalAnd_000) #addeval(LogicalNot_000) #addeval(LogicalOr_000) -#addeval(Logistic_000) +addeval(Logistic_000) +#addeval(LogSoftmax_000) +#addeval(MatMul_000) +#addeval(MatrixDiag_000) +#addeval(MatrixSetDiag_000) +#addeval(Maximum_000) addeval(MaxPool2D_000) addeval(MaxPool2D_U8_000) addeval(Mean_000) addeval(Mean_001) addeval(Mean_U8_000) +#addeval(Minimum_000) +#addeval(MirrorPad_000) addeval(Mul_000) #addeval(Mul_U8_000) +#addeval(Neg_000) +#addeval(NotEqual_000) +#addeval(OneHot_000) +#addeval(OneHot_001) +#addeval(OneHot_002) +#addeval(OneHot_003) #addeval(Pack_000) #addeval(Pack_U8_000) addeval(Pad_000) addeval(Pad_U8_000) +#addeval(Pow_000) +#addeval(PRelu_000) +#addeval(Range_000) +#addeval(Rank_000) +#addeval(ReduceAny_000) +#addeval(ReduceAny_001) +#addeval(ReduceAny_002) +#addeval(ReduceAny_003) +#addeval(ReduceMax_000) +#addeval(ReduceMin_000) #addeval(ReduceProd_000) #addeval(ReduceProd_001) #addeval(ReduceProd_002) #addeval(ReduceProd_003) #addeval(ReLU_000) +#addeval(ReLU6_000) +#addeval(ReLUN1To1_000) addeval(Reshape_000) addeval(Reshape_001) addeval(Reshape_002) #addeval(Reshape_003) addeval(Reshape_U8_000) +#addeval(ResizeBilinear_000) +#addeval(ResizeNearestNeighbor_000) +#addeval(ReverseSequence_000) +#addeval(ReverseV2_000) +#addeval(Round_000) #addeval(Rsqrt_000) +#addeval(ScatterNd_000) +#addeval(SegmentSum_000) +#addeval(Select_000) +#addeval(Select_001) +#addeval(Select_002) +#addeval(SelectV2_000) +#addeval(SelectV2_001) +#addeval(SelectV2_002) +#addeval(Shape_000) #addeval(Sin_000) +addeval(Slice_000) addeval(Softmax_000) #addeval(Softmax_U8_000) #addeval(SpaceToBatchND_000) #addeval(SpaceToBatchND_001) #addeval(SpaceToBatchND_002) #addeval(SpaceToBatchND_003) -#addeval(StridedSlice_000) -#addeval(StridedSlice_001) +#addeval(SpaceToDepth_000) +#addeval(SparseToDense_000) +#addeval(Split_000) +#addeval(SplitV_000) +#addeval(Sqrt_000) +#addeval(Square_000) +#addeval(SquaredDifference_000) +addeval(Squeeze_000) +addeval(StridedSlice_000) +addeval(StridedSlice_001) +addeval(StridedSlice_002) #addeval(Sub_000) #addeval(Sub_U8_000) +#addeval(Sum_000) +#addeval(Sum_001) #addeval(Tanh_000) #addeval(Tile_000) #addeval(Tile_U8_000) -#addeval(Transpose_000) +#addeval(TopKV2_000) +#addeval(TopKV2_001) +addeval(Transpose_000) +#addeval(TransposeConv_000) #addeval(Unpack_000) #addeval(Unpack_001) #addeval(Unpack_002) +addeval(Unpack_003) +#addeval(Where_000) +#addeval(Where_001) #addeval(While_000) #addeval(While_001) +#addeval(While_002) +#addeval(While_003) +#addeval(YUV_TO_RGB_U8_000) +#addeval(ZerosLike_000) diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp index 3c01b67..344c99f 100644 --- a/compiler/luci/export/src/CircleOperationExporter.cpp +++ b/compiler/luci/export/src/CircleOperationExporter.cpp @@ -890,7 +890,7 @@ void OperationExporter::visit(luci::CircleSpaceToDepth *node) { export_simple(node, circle::BuiltinOperator_SPACE_TO_DEPTH, circle::BuiltinOptions_SpaceToDepthOptions, - CreateSpaceToDepthOptions(builder).Union()); + CreateSpaceToDepthOptions(builder, node->block_size()).Union()); } void OperationExporter::visit(luci::CircleSparseToDense *node) diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp 
b/compiler/luci/export/src/CircleTensorExporter.cpp index 5cad392..dc8c2fb 100644 --- a/compiler/luci/export/src/CircleTensorExporter.cpp +++ b/compiler/luci/export/src/CircleTensorExporter.cpp @@ -302,7 +302,10 @@ encodeQuantizationParameters(FlatBufferBuilder &builder, luci::CircleQuantParam scale = builder.CreateVector(quantparam->scale); zero_point = builder.CreateVector(quantparam->zerop); } - return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point); + // Note: QuantizationDetails is not supported + return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point, + circle::QuantizationDetails::QuantizationDetails_NONE, + 0, quantparam->quantized_dimension); } void exportOpDefinedTensor(const CircleTensoInfo &info, FlatBufferBuilder &builder, diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp index 81e945d..bc7f397 100644 --- a/compiler/luci/import/src/CircleReader.cpp +++ b/compiler/luci/import/src/CircleReader.cpp @@ -156,6 +156,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization) const auto &max = quantization->max; const auto &scale = quantization->scale; const auto &zero_point = quantization->zero_point; + const auto &quantized_dimension = quantization->quantized_dimension; if ((!min.empty() && !max.empty()) || (!scale.empty() && !zero_point.empty())) { @@ -165,6 +166,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization) quantparam->max = max; quantparam->scale = scale; quantparam->zerop = zero_point; + quantparam->quantized_dimension = quantized_dimension; return quantparam; } diff --git a/compiler/luci/import/src/Importer.test.cpp b/compiler/luci/import/src/Importer.test.cpp index 4426e15..8366546 100644 --- a/compiler/luci/import/src/Importer.test.cpp +++ b/compiler/luci/import/src/Importer.test.cpp @@ -20,4 +20,9 @@ #include -TEST(TensorFlowLiteImport, Dummy) { luci::Importer import; } +TEST(TensorFlowLiteImport, Dummy) +{ + luci::Importer import; + + SUCCEED(); +} diff --git a/compiler/luci/import/src/Nodes/CircleLogistic.cpp b/compiler/luci/import/src/Nodes/CircleLogistic.cpp index 85e7e55..c77c55e 100644 --- a/compiler/luci/import/src/Nodes/CircleLogistic.cpp +++ b/compiler/luci/import/src/Nodes/CircleLogistic.cpp @@ -32,21 +32,7 @@ bool CircleLogisticGraphBuilder::validate(const ValidateArgs &args) const if (outputs.size() != 1) return false; - // Must be one of the following types - // float16, float32, float64, complex64, or complex128 const auto &tensors = args.reader.tensors(); - const auto &tensor = tensors.at(inputs[0]); - switch (tensor->type) - { - case circle::TensorType_FLOAT16: - case circle::TensorType_FLOAT32: - case circle::TensorType_FLOAT64: - case circle::TensorType_COMPLEX64: - break; - default: - return false; - } - if (tensors.at(inputs[0])->type != tensors.at(outputs[0])->type) return false; diff --git a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp index 7bdf46d..eb0956c 100644 --- a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp +++ b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp @@ -30,6 +30,24 @@ bool CircleTransposeConvGraphBuilder::validate(const ValidateArgs &args) const if (args.op.inputs.size() != 3) return false; + const auto &inputs = args.op.inputs; + const auto &tensors = args.reader.tensors(); + const auto &filter_tensor = tensors.at(inputs[1]); + const auto &filter_shape = filter_tensor.get()->shape; + const auto &ifm_tensor = 
tensors.at(inputs[2]); + const auto &ifm_shape = ifm_tensor.get()->shape; + + // ifm and filters must be 4-D tensor + if (ifm_shape.size() != 4) + return false; + if (filter_shape.size() != 4) + return false; + + // input shape : [batch, height, width, in_channels] + // filters shape : [output_channels, height, weight, in_channels] + if (ifm_tensor.get()->shape.at(3) != filter_tensor.get()->shape.at(3)) + return false; + return true; } diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.lst b/compiler/luci/lang/include/luci/IR/CircleNodes.lst index 488dcfb..acd7921 100644 --- a/compiler/luci/lang/include/luci/IR/CircleNodes.lst +++ b/compiler/luci/lang/include/luci/IR/CircleNodes.lst @@ -120,6 +120,7 @@ CIRCLE_NODE(BCQ_FULLY_CONNECTED, luci::CircleBCQFullyConnected) CIRCLE_NODE(BCQ_GATHER, luci::CircleBCQGather) CIRCLE_NODE(INSTANCE_NORM, luci::CircleInstanceNorm) // Virtual node(s) +CIRCLE_NODE(CIRCLECONST, void) CIRCLE_NODE(CIRCLEINPUT, luci::CircleInput) CIRCLE_NODE(CIRCLEOUTPUT, luci::CircleOutput) CIRCLE_NODE(CIRCLEOUTPUTDUMMY, luci::CircleOutputDummy) diff --git a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h index 7253e65..6944373 100644 --- a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h +++ b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h @@ -29,6 +29,7 @@ struct CircleQuantParam std::vector max; std::vector scale; std::vector zerop; + int32_t quantized_dimension{0}; }; } // namespace luci diff --git a/compiler/luci/lang/src/Module.test.cpp b/compiler/luci/lang/src/Module.test.cpp index 26bf073..a5973e5 100644 --- a/compiler/luci/lang/src/Module.test.cpp +++ b/compiler/luci/lang/src/Module.test.cpp @@ -22,7 +22,7 @@ TEST(ModuleTest, consturctor) { auto gs = luci::make_module(); - GTEST_SUCCEED(); + SUCCEED(); } TEST(ModuleTest, add) diff --git a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp index 74ea82c..c07268c 100644 --- a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp +++ b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp @@ -35,7 +35,12 @@ TEST(CircleCustomTest, constructor) ASSERT_EQ(0, custom_node.custom_code().size()); } -TEST(CircleCustomTest, constructor_NEG) { ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); } +TEST(CircleCustomTest, constructor_NEG) +{ + ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); + + SUCCEED(); +} TEST(CircleCustomTest, invalidIndex_NEG) { diff --git a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp index e3c8c9f..35f28e9 100644 --- a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp +++ b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp @@ -41,11 +41,15 @@ TEST(CircleIfTest, constructor) TEST(CircleIfTestDeath, invalid_arity_NEG) { ASSERT_DEBUG_DEATH(luci::CircleIf very_long_name_if_node(0, 1), ""); + + SUCCEED(); } TEST(CircleIfTestDeath, invalid_output_count_NEG) { ASSERT_DEBUG_DEATH(luci::CircleIf if_node(2, 0), ""); + + SUCCEED(); } TEST(CircleIfTestDeath, invalid_input_get_index_NEG) diff --git a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp index 19290c0..913686f 100644 --- a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp +++ b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp @@ -41,11 +41,15 @@ TEST(CircleWhileTest, constructor) TEST(CircleWhileTestDeath, invalid_arity_NEG) { ASSERT_DEBUG_DEATH(luci::CircleWhile very_long_name_while_node(0, 1), ""); + + SUCCEED(); } TEST(CircleWhileTestDeath, 
invalid_output_count_NEG) { ASSERT_DEBUG_DEATH(luci::CircleWhile while_node(2, 0), ""); + + SUCCEED(); } TEST(CircleWhileTestDeath, invalid_input_get_index_NEG) diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp index 90fbe90..2edf7a9 100644 --- a/compiler/luci/pass/src/CircleOptimizer.cpp +++ b/compiler/luci/pass/src/CircleOptimizer.cpp @@ -145,7 +145,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const { static const std::vector fakeq_supported_input_dtype{"float32"}; static const std::vector fakeq_supported_output_dtype{"uint8"}; - static const std::vector fakeq_supported_granularity{"layer"}; + static const std::vector fakeq_supported_granularity{"layer", "channel"}; auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype); @@ -173,7 +173,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const { static const std::vector qwmm_supported_input_dtype{"float32"}; static const std::vector qwmm_supported_output_dtype{"uint8"}; - static const std::vector qwmm_supported_granularity{"layer"}; + static const std::vector qwmm_supported_granularity{"layer", "channel"}; auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype); diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp index b81db88..edbaefa 100644 --- a/compiler/luci/pass/src/FuseBCQPass.cpp +++ b/compiler/luci/pass/src/FuseBCQPass.cpp @@ -67,14 +67,190 @@ const std::string node_name_prefix(luci::NodeName node_name) return prefix; } +/** + * @brief Create CircleOutputExclude operation, which has same shape and dtype with + * original circle_node. + */ +luci::CircleOutputExclude *createNoOp(luci::CircleNode *circle_node) +{ + auto graph = circle_node->graph(); + auto noOp = graph->nodes()->create(); + + if (circle_node->shape_status() == luci::ShapeStatus::VALID) + { + noOp->dtype(circle_node->dtype()); + noOp->rank(circle_node->rank()); + for (uint32_t i = 0; i < circle_node->rank(); ++i) + noOp->dim(i) = circle_node->dim(i); + } + else + { + // For type inference + noOp->dtype(loco::DataType::FLOAT32); + } + + return noOp; +}; + } // namespace namespace { -class BCQConverter final +// V means the version of BCQ. 
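+// Dispatch is done with one explicit specialization per metadata version, roughly (sketch;
+// only version 1 exists in this pass, and a BCQFuser<2> is shown purely as a hypothetical):
+//
+//   template <int V> class BCQFuser;          // V = BCQ metadata version
+//   template <> class BCQFuser<1> { public: bool fuseBCQ(loco::Graph *g); /* ... */ };
+//   template <> class BCQFuser<2> { /* would handle a newer BCQ metadata layout */ };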
+template class BCQFuser; + +template <> class BCQFuser<1> { public: + bool fuseBCQ(loco::Graph *g) + { + bool changed = false; + + for (auto node : loco::all_nodes(g)) + { + if (auto circle_const = dynamic_cast(node)) + { + add_BCQ_info_node(circle_const); + } + } + + if (!is_bcqinfo_valid()) + return false; + + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + if (auto gather = dynamic_cast(node)) + { + auto params = dynamic_cast(gather->params()); + if (params != nullptr && has_BCQ_info(params)) + { + auto bcq_gather = g->nodes()->create(); + + bcq_gather->op_version(1); + bcq_gather->input_scales(get_alpha(params)); + bcq_gather->input_binary(get_packed_binary_code(params)); + bcq_gather->indices(gather->indices()); + bcq_gather->input_clusters(packed_clusters(params)); + + // input_binary shape : [output_size, hidden_size] + const auto binary_hidden_size = + loco::must_cast(bcq_gather->input_binary())->dim(1).value() * 32; + bcq_gather->input_hidden_size(binary_hidden_size); + + if (do_w_x(params)) + { + bcq_gather->axis(gather->axis()); + } + else + { + const auto axis_transpose = (gather->axis() == 0) ? 1 : 0; + bcq_gather->axis(axis_transpose); + } + + loco::replace(gather).with(bcq_gather); + + changed = true; + } + } + else if (auto fully_connected = dynamic_cast(node)) + { + auto weights = dynamic_cast(fully_connected->weights()); + if (weights != nullptr && has_BCQ_info(weights)) + { + auto bcq_fc = g->nodes()->create(); + + bcq_fc->op_version(1); + bcq_fc->weights_scales(get_alpha(weights)); + bcq_fc->weights_binary(get_packed_binary_code(weights)); + bcq_fc->bias(fully_connected->bias()); + bcq_fc->weights_clusters(packed_clusters(weights)); + bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); + + loco::Node *bcq_input = fully_connected->input(); + int32_t batch_rank = 0; + + // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2 + const auto original_input = loco::must_cast(fully_connected->input()); + if (original_input->shape_status() == luci::ShapeStatus::VALID && + original_input->rank() > 2) + { + auto new_shape = g->nodes()->create(); + new_shape->dtype(loco::DataType::S32); + new_shape->size(2); + new_shape->rank(1); + new_shape->dim(0) = 2; + + auto batch_size = 1; + for (uint32_t i = 0; i < original_input->rank() - 1; ++i) + batch_size *= original_input->dim(i).value(); + + new_shape->at(0) = batch_size; + new_shape->at(1) = + original_input->dim(original_input->rank() - 1).value(); + new_shape->shape_status(luci::ShapeStatus::VALID); + + auto reshape = g->nodes()->create(); + reshape->tensor(original_input); + reshape->shape(new_shape); + + bcq_input = reshape; + batch_rank = original_input->rank() - 2; + } + + // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected + if (do_w_x(weights)) + { + const auto binary_hidden_size = + loco::must_cast(fully_connected->input()) + ->dim(batch_rank) + .value(); + bcq_fc->weights_hidden_size(binary_hidden_size); + bcq_fc->input(bcq_input); + loco::replace(fully_connected).with(bcq_fc); + } + else + { + const auto binary_hidden_size = + loco::must_cast(fully_connected->input()) + ->dim(1 + batch_rank) + .value(); + bcq_fc->weights_hidden_size(binary_hidden_size); + + auto perm = g->nodes()->create(); + perm->dtype(loco::DataType::S32); + perm->size(2); + perm->rank(1); + perm->dim(0) = 2; + perm->at(0) = 1; + perm->at(1) = 0; + perm->shape_status(luci::ShapeStatus::VALID); + + auto input_transpose = g->nodes()->create(); + 
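+          // perm = {1, 0}: transpose the rank-2 input in front of BCQFullyConnected and
+          // transpose its output back with the same permutation (see output_transpose below).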
input_transpose->a(bcq_input); + input_transpose->perm(perm); + + bcq_fc->input(input_transpose); + + auto output_transpose = g->nodes()->create(); + output_transpose->a(bcq_fc); + output_transpose->perm(perm); + + loco::replace(fully_connected).with(output_transpose); + } + + changed = true; + } + } + } + + if (changed) + clear_BCQ_nodes(); + + return changed; + } + +private: void add_BCQ_info_node(luci::CircleConst *node) { const auto node_name = node->name(); @@ -119,16 +295,65 @@ public: return has_info; } + /** + * @brief Exclude BCQ information nodes which are used for fusing BCQ operations + * from graph output by using CircleOutputExclude + */ + void clear_BCQ_nodes() + { + auto clear_nodes = [](std::map &nodes) { + for (auto &n : nodes) + { + auto node = n.second; + + for (auto s : loco::succs(node)) + { + if (auto outnode = dynamic_cast(s)) + { + outnode->from(createNoOp(node)); + } + else if (auto reshape_node = dynamic_cast(s)) + { + for (auto o : loco::succs(reshape_node)) + { + auto circle_output = loco::must_cast(o); + circle_output->from(createNoOp(reshape_node)); + } + } + } + } + }; + + clear_nodes(_do_w_x); + clear_nodes(_alpha); + clear_nodes(_packed_binary_code); + clear_nodes(_number_of_clusters); + clear_nodes(_size_of_clusters); + clear_nodes(_qbits_of_clusters); + clear_nodes(_dequant_weight); + } + + bool is_bcqinfo_valid() + { + // do_w_x should be int32 or bool type + for (auto n : _do_w_x) + { + if (n.second->dtype() != loco::DataType::BOOL && n.second->dtype() != loco::DataType::S32) + return false; + } + + return true; + } + +private: bool do_w_x(luci::CircleConst *node) { const auto prefix = node_name_prefix(node->name()); if (_do_w_x[prefix]->dtype() == loco::DataType::S32) return _do_w_x[prefix]->at(0) == 1; - else if (_do_w_x[prefix]->dtype() == loco::DataType::BOOL) - return _do_w_x[prefix]->at(0); else - throw std::runtime_error("do_w_x should be int or bool"); + return _do_w_x[prefix]->at(0); } luci::CircleConst *get_alpha(luci::CircleConst *node) @@ -187,64 +412,6 @@ public: return packed_clusters; } - /** - * @brief Exclude BCQ information nodes which are used for fusing BCQ operations - * from graph output by using CircleOutputExclude - */ - void clear_BCQ_nodes() - { - auto createNoOp = [](luci::CircleNode *circle_node) { - auto graph = circle_node->graph(); - auto noOp = graph->nodes()->create(); - - if (circle_node->shape_status() == luci::ShapeStatus::VALID) - { - noOp->dtype(circle_node->dtype()); - noOp->rank(circle_node->rank()); - for (uint32_t i = 0; i < circle_node->rank(); ++i) - noOp->dim(i) = circle_node->dim(i); - } - else - { - // For type inference - noOp->dtype(loco::DataType::FLOAT32); - } - - return noOp; - }; - - auto clear_nodes = [createNoOp](std::map &nodes) { - for (auto &n : nodes) - { - auto node = n.second; - - for (auto s : loco::succs(node)) - { - if (auto outnode = dynamic_cast(s)) - { - outnode->from(createNoOp(node)); - } - else if (auto reshape_node = dynamic_cast(s)) - { - for (auto o : loco::succs(reshape_node)) - { - auto circle_output = loco::must_cast(o); - circle_output->from(createNoOp(reshape_node)); - } - } - } - } - }; - - clear_nodes(_do_w_x); - clear_nodes(_alpha); - clear_nodes(_packed_binary_code); - clear_nodes(_number_of_clusters); - clear_nodes(_size_of_clusters); - clear_nodes(_qbits_of_clusters); - clear_nodes(_dequant_weight); - } - private: std::map _do_w_x; std::map _alpha; @@ -262,142 +429,9 @@ namespace luci bool FuseBCQPass::run(loco::Graph *g) { - BCQConverter converter; - bool changed = 
false; - for (auto node : loco::all_nodes(g)) - { - if (auto circle_const = dynamic_cast(node)) - { - converter.add_BCQ_info_node(circle_const); - } - } - - for (auto node : loco::active_nodes(loco::output_nodes(g))) - { - if (auto gather = dynamic_cast(node)) - { - auto params = dynamic_cast(gather->params()); - if (params != nullptr && converter.has_BCQ_info(params)) - { - auto bcq_gather = g->nodes()->create(); - - bcq_gather->input_scales(converter.get_alpha(params)); - bcq_gather->input_binary(converter.get_packed_binary_code(params)); - bcq_gather->indices(gather->indices()); - bcq_gather->input_clusters(converter.packed_clusters(params)); - - const auto binary_hidden_size = - loco::must_cast(bcq_gather->input_binary())->dim(1).value() * 32; - bcq_gather->input_hidden_size(binary_hidden_size); - - if (converter.do_w_x(params)) - { - bcq_gather->axis(gather->axis()); - } - else - { - const auto axis_transpose = (gather->axis() == 0) ? 1 : 0; - bcq_gather->axis(axis_transpose); - } - - loco::replace(gather).with(bcq_gather); - - changed = true; - } - } - else if (auto fully_connected = dynamic_cast(node)) - { - auto weights = dynamic_cast(fully_connected->weights()); - if (weights != nullptr && converter.has_BCQ_info(weights)) - { - auto bcq_fc = g->nodes()->create(); - - bcq_fc->weights_scales(converter.get_alpha(weights)); - bcq_fc->weights_binary(converter.get_packed_binary_code(weights)); - bcq_fc->bias(fully_connected->bias()); - bcq_fc->weights_clusters(converter.packed_clusters(weights)); - bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); - - loco::Node *bcq_input = fully_connected->input(); - int32_t batch_rank = 0; - - // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2 - const auto original_input = loco::must_cast(fully_connected->input()); - if (original_input->shape_status() == ShapeStatus::VALID && original_input->rank() > 2) - { - auto new_shape = g->nodes()->create(); - new_shape->dtype(loco::DataType::S32); - new_shape->size(2); - new_shape->rank(1); - new_shape->dim(0) = 2; - - auto batch_size = 1; - for (uint32_t i = 0; i < original_input->rank() - 1; ++i) - batch_size *= original_input->dim(i).value(); - - new_shape->at(0) = batch_size; - new_shape->at(1) = - original_input->dim(original_input->rank() - 1).value(); - new_shape->shape_status(ShapeStatus::VALID); - - auto reshape = g->nodes()->create(); - reshape->tensor(original_input); - reshape->shape(new_shape); - - bcq_input = reshape; - batch_rank = original_input->rank() - 2; - } - - // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected - if (converter.do_w_x(weights)) - { - const auto binary_hidden_size = - loco::must_cast(fully_connected->input()) - ->dim(batch_rank) - .value(); - bcq_fc->weights_hidden_size(binary_hidden_size); - bcq_fc->input(bcq_input); - loco::replace(fully_connected).with(bcq_fc); - } - else - { - const auto binary_hidden_size = - loco::must_cast(fully_connected->input()) - ->dim(1 + batch_rank) - .value(); - bcq_fc->weights_hidden_size(binary_hidden_size); - - auto perm = g->nodes()->create(); - perm->dtype(loco::DataType::S32); - perm->size(2); - perm->rank(1); - perm->dim(0) = 2; - perm->at(0) = 1; - perm->at(1) = 0; - perm->shape_status(ShapeStatus::VALID); - - auto input_transpose = g->nodes()->create(); - input_transpose->a(bcq_input); - input_transpose->perm(perm); - - bcq_fc->input(input_transpose); - - auto output_transpose = g->nodes()->create(); - output_transpose->a(bcq_fc); - 
output_transpose->perm(perm); - - loco::replace(fully_connected).with(output_transpose); - } - - changed = true; - } - } - } - - if (changed) - converter.clear_BCQ_nodes(); + changed = BCQFuser<1>().fuseBCQ(g); return changed; } diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp index 6726ce7..9c9e741 100644 --- a/compiler/luci/pass/src/QuantizationUtils.cpp +++ b/compiler/luci/pass/src/QuantizationUtils.cpp @@ -99,6 +99,13 @@ void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t nudged_zero_point = static_cast(std::round(zero_point_double)); } + // protect scale from being very low due to overflow + if (scale < 1e-5) + { + scale = 1e-5; + nudged_zero_point = static_cast(std::round(qmin_double - rmin / scale)); + } + nudged_min = static_cast((qmin_double - nudged_zero_point) * scale); nudged_max = static_cast((qmax_double - nudged_zero_point) * scale); diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp index f8abee7..2264bd7 100644 --- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp +++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp @@ -138,7 +138,8 @@ bool is_quantized(const CircleNode *node) node->dtype() == loco::DataType::S32; // bias } -void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_factor) +void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_factor, + int32_t &channel_dim_index) { assert(node->dtype() == loco::DataType::FLOAT32); @@ -153,7 +154,6 @@ void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_facto uint32_t indices[4] = { 0, }; - int channel_dim_index{0}; if (!get_channel_dim_index(node, dimension, channel_dim_index)) { @@ -189,7 +189,7 @@ void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_facto } void asym_wquant_per_channel(CircleConst *node, std::vector &min, - std::vector &scaling_factor) + std::vector &scaling_factor, int32_t &channel_dim_index) { assert(node->dtype() == loco::DataType::FLOAT32); @@ -204,7 +204,6 @@ void asym_wquant_per_channel(CircleConst *node, std::vector &min, uint32_t indices[4] = { 0, }; - int channel_dim_index{0}; if (!get_channel_dim_index(node, dimension, channel_dim_index)) { @@ -350,8 +349,8 @@ struct QuantizeActivation final : public luci::CircleNodeMutableVisitor circle_node->dtype(loco::DataType::S16); } - circle_node->quantparam()->max[0] = nudged_max; - circle_node->quantparam()->min[0] = nudged_min; + circle_node->quantparam()->min.clear(); + circle_node->quantparam()->max.clear(); circle_node->quantparam()->scale.push_back(scaling_factor); circle_node->quantparam()->zerop.push_back(zp); } @@ -472,15 +471,19 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor assert(quantparam != nullptr); auto min = quantparam->min; auto scaling_factor = quantparam->scale; + int32_t channel_dim_index = 0; if (output_type == loco::DataType::U8) { - asym_wquant_per_channel(circle_const, min, scaling_factor); + asym_wquant_per_channel(circle_const, min, scaling_factor, channel_dim_index); } else { - sym_wquant_per_channel(circle_const, scaling_factor); + sym_wquant_per_channel(circle_const, scaling_factor, channel_dim_index); } + quantparam->min.clear(); + quantparam->max.clear(); + quantparam->quantized_dimension = channel_dim_index; } // Find min/max per layer-wise else @@ -493,6 +496,8 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor auto min = quantparam->min[0]; auto scaling_factor = 
quantparam->scale[0]; asym_wquant_per_layer(circle_const, min, scaling_factor); + quantparam->min.clear(); + quantparam->max.clear(); } } } diff --git a/compiler/luci/tests/test.lst b/compiler/luci/tests/test.lst index 188e298..3da3437 100644 --- a/compiler/luci/tests/test.lst +++ b/compiler/luci/tests/test.lst @@ -30,13 +30,16 @@ addread(Ceil_000) addread(Concatenation_000) addread(Concatenation_U8_000) addread(Conv2D_000) +addread(Conv2D_001) addread(Conv2D_002) addread(Conv2D_003) addread(Conv2D_U8_000) +addread(Conv2D_U8_001) addread(Cos_000) addread(DepthToSpace_000) addread(DepthwiseConv2D_000) addread(DepthwiseConv2D_U8_000) +addread(DepthwiseConv2D_U8_001) addread(DepthwiseConv2D_001) addread(Div_000) addread(ELU_000) @@ -84,6 +87,7 @@ addread(MaxPool2D_000) addread(MaxPool2D_U8_000) addread(Mean_000) addread(Mean_001) +addread(Mean_U8_000) addread(Minimum_000) addread(MirrorPad_000) addread(Mul_000) @@ -97,6 +101,7 @@ addread(OneHot_003) addread(Pack_000) addread(Pack_U8_000) addread(Pad_000) +addread(Pad_U8_000) addread(Pow_000) addread(PRelu_000) addread(Range_000) @@ -222,13 +227,16 @@ addwrite(Ceil_000) addwrite(Concatenation_000) addwrite(Concatenation_U8_000) addwrite(Conv2D_000) +addwrite(Conv2D_001) addwrite(Conv2D_002) addwrite(Conv2D_003) addwrite(Conv2D_U8_000) +addwrite(Conv2D_U8_001) addwrite(Cos_000) addwrite(DepthToSpace_000) addwrite(DepthwiseConv2D_000) addwrite(DepthwiseConv2D_U8_000) +addwrite(DepthwiseConv2D_U8_001) addwrite(DepthwiseConv2D_001) addwrite(Div_000) addwrite(ELU_000) @@ -276,6 +284,7 @@ addwrite(MaxPool2D_000) addwrite(MaxPool2D_U8_000) addwrite(Mean_000) addwrite(Mean_001) +addwrite(Mean_U8_000) addwrite(Minimum_000) addwrite(MirrorPad_000) addwrite(Mul_000) diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen index 2c80664..820b6d8 100644 --- a/compiler/one-cmds/one-codegen +++ b/compiler/one-cmds/one-codegen @@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" function Usage() { - echo "Usage: $0 [BACKEND] ..." + echo "Usage: one-codegen [BACKEND] ..." echo "Available BACKEND drivers:" backend_exist=0 for file in `find $DRIVER_PATH -name *-compile -type f`; @@ -33,23 +33,34 @@ function Usage() if [ $backend_exist == 0 ]; then echo " (There is no available backend drivers)" fi + + exit 255 } -# Get command from command-line -BACKEND=$1; shift -BACKEND_DRIVER="$BACKEND-compile" +function version() +{ + $DRIVER_PATH/one-version one-codegen + exit 255 +} -if [[ -z "${BACKEND_DRIVER}" ]]; then +# Get command from command-line +BACKEND=$1 +if [[ -z ${BACKEND} ]]; then Usage - exit 255 fi +shift + +if [[ "${BACKEND}" == "--version" ]]; then + version +fi + +BACKEND_DRIVER="${BACKEND}-compile" BACKEND_DRIVER_CMD="${DRIVER_PATH}/${BACKEND_DRIVER}" if [[ ! -f "${BACKEND_DRIVER_CMD}" ]]; then echo "ERROR: '${BACKEND_DRIVER}' is not supported" Usage - exit 255 fi "${BACKEND_DRIVER_CMD}" "$@" diff --git a/compiler/one-cmds/one-import b/compiler/one-cmds/one-import index dbf4af5..b1dd8f4 100644 --- a/compiler/one-cmds/one-import +++ b/compiler/one-cmds/one-import @@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" function Usage() { - echo "Usage: $0 [FRAMEWORK] ..." + echo "Usage: one-import [FRAMEWORK] ..." 
echo "Available FRAMEWORK drivers:" framework_exist=0 for file in "$DRIVER_PATH"/one-import-*; @@ -31,23 +31,34 @@ function Usage() if [ $framework_exist == 0 ]; then echo " (There is no available import drivers)" fi + + exit 255 } -# Get command from command-line -FRAMEWORK=$1; shift -FRAMEWORK_DRIVER="one-import-$FRAMEWORK" +function version() +{ + $DRIVER_PATH/one-version one-import-tf + exit 255 +} -if [[ -z "${FRAMEWORK_DRIVER}" ]]; then +# Get command from command-line +FRAMEWORK=$1 +if [[ -z ${FRAMEWORK} ]]; then Usage - exit 255 +fi +shift + +if [ ${FRAMEWORK} = "--version" ]; then + version fi +FRAMEWORK_DRIVER="one-import-$FRAMEWORK" + FRAMEWORK_DRIVER_CMD="${DRIVER_PATH}/${FRAMEWORK_DRIVER}" if [[ ! -f "${FRAMEWORK_DRIVER_CMD}" ]]; then echo "ERROR: '${FRAMEWORK_DRIVER}' is not supported" Usage - exit 255 fi "${FRAMEWORK_DRIVER_CMD}" "$@" diff --git a/compiler/one-cmds/one-import-tf b/compiler/one-cmds/one-import-tf index c048a4e..d59e1c5 100644 --- a/compiler/one-cmds/one-import-tf +++ b/compiler/one-cmds/one-import-tf @@ -22,14 +22,24 @@ usage() { echo "Convert TensorFlow model to circle." echo "Usage: one-import-tf" + echo " --version Show version information and exit" echo " --input_path " echo " --output_path " echo " --input_arrays " echo " --input_shapes " echo " --output_arrays " - exit 0 + echo " --v2 Use TensorFlow 2.x interface (default is 1.x interface)" + exit 255 } +version() +{ + $DRIVER_PATH/one-version one-import-tf + exit 255 +} + +TF_INTERFACE="--v1" + # Parse command-line arguments # while [ "$#" -ne 0 ]; do @@ -39,6 +49,9 @@ while [ "$#" -ne 0 ]; do '--help') usage ;; + '--version') + version + ;; '--input_path') export INPUT_PATH="$2" shift 2 @@ -59,6 +72,10 @@ while [ "$#" -ne 0 ]; do export OUTPUT_ARRAYS="$2" shift 2 ;; + '--v2') + TF_INTERFACE="--v2" + shift + ;; *) echo "Unknown parameter: ${CUR}" shift @@ -92,14 +109,21 @@ fi # remove previous log rm -rf "${OUTPUT_PATH}.log" +show_err_onexit() +{ + cat "${OUTPUT_PATH}.log" +} + +trap show_err_onexit ERR + # generate temporary tflite file -echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \ +echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \ --input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \ --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ --output_arrays ${OUTPUT_ARRAYS} > "${OUTPUT_PATH}.log" echo " " >> "${OUTPUT_PATH}.log" -python "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \ +python "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \ --input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \ --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ --output_arrays ${OUTPUT_ARRAYS} >> "${OUTPUT_PATH}.log" 2>&1 diff --git a/compiler/one-cmds/one-import-tflite b/compiler/one-cmds/one-import-tflite index 31ed5af..053489c 100644 --- a/compiler/one-cmds/one-import-tflite +++ b/compiler/one-cmds/one-import-tflite @@ -22,9 +22,16 @@ usage() { echo "Convert TensorFlow lite model to circle." echo "Usage: one-import-tflite" + echo " --version Show version information and exit" echo " --input_path " echo " --output_path " - exit 0 + exit 255 +} + +version() +{ + $DRIVER_PATH/one-version one-import-tflite + exit 255 } # Parse command-line arguments @@ -36,6 +43,9 @@ while [ "$#" -ne 0 ]; do '--help') usage ;; + '--version') + version + ;; '--input_path') export INPUT_PATH="$2" shift 2 @@ -55,12 +65,18 @@ if [ -z ${INPUT_PATH} ] || [ ! 
-e ${INPUT_PATH} ]; then echo "Error: input model not found" echo "" usage - exit 2 fi # remove previous log rm -rf "${OUTPUT_PATH}.log" +show_err_onexit() +{ + cat "${OUTPUT_PATH}.log" +} + +trap show_err_onexit ERR + # convert .tflite to .circle echo "${DRIVER_PATH}/tflite2circle" "${INPUT_PATH}" "${OUTPUT_PATH}" > "${OUTPUT_PATH}.log" diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize index 95384c1..17b6b98 100644 --- a/compiler/one-cmds/one-optimize +++ b/compiler/one-cmds/one-optimize @@ -22,6 +22,7 @@ usage() { echo "Optimize circle model." echo "Usage: one-optimize" + echo " --version Show version information and exit" echo " --all Enable all optimization algorithms" echo " --fuse_bcq Enable FuseBCQ Pass" echo " --fuse_instnorm Enable FuseInstanceNormalization Pass" @@ -33,7 +34,13 @@ usage() echo " Enable ResolveCustomOpMatMulPass Pass" echo " --input_path " echo " --output_path " - exit 0 + exit 255 +} + +version() +{ + $DRIVER_PATH/one-version one-optimize + exit 255 } OPTIMIZE_all=0 @@ -52,6 +59,9 @@ while [ "$#" -ne 0 ]; do '--help') usage ;; + '--version') + version + ;; '--all') OPTIMIZE_all=1 shift @@ -96,7 +106,6 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then echo "Error: input model not found" echo "" usage - exit 2 fi OPTIMIZE_OPTIONS="" @@ -123,6 +132,13 @@ fi # remove previous log rm -rf "${OUTPUT_PATH}.log" +show_err_onexit() +{ + cat "${OUTPUT_PATH}.log" +} + +trap show_err_onexit ERR + # NOTE do not wrap ${OPTIMIZE_OPTIONS} with "" # optimize circle echo "${DRIVER_PATH}/circle2circle" ${OPTIMIZE_OPTIONS} \ diff --git a/compiler/one-cmds/one-pack b/compiler/one-cmds/one-pack index 2bc4c60..9224b2c 100644 --- a/compiler/one-cmds/one-pack +++ b/compiler/one-cmds/one-pack @@ -22,9 +22,16 @@ usage() { echo "Package circle to nnpkg" echo "Usage: one-pack" + echo " -v, --version Show version information and exit" echo " -i " echo " -o " - exit 0 + exit 255 +} + +version() +{ + $DRIVER_PATH/one-version one-pack + exit 255 } # Parse command-line arguments @@ -36,6 +43,12 @@ while [ "$#" -ne 0 ]; do '--help') usage ;; + '-v') + version + ;; + '--version') + version + ;; '-i') export INPUT_PATH="$2" shift 2 @@ -55,12 +68,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then echo "Error: input model not found" echo "" usage - exit 2 fi # remove previous log rm -rf "${OUTPUT_PATH}.log" +show_err_onexit() +{ + cat "${OUTPUT_PATH}.log" +} + +trap show_err_onexit ERR + # Package circle model file to nnpkg echo "${DRIVER_PATH}/model2nnpkg.sh" -o "${OUTPUT_PATH}" "${INPUT_PATH}" > "${OUTPUT_PATH}.log" diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize index ff9e266..c74b2c2 100644 --- a/compiler/one-cmds/one-quantize +++ b/compiler/one-cmds/one-quantize @@ -22,16 +22,23 @@ usage() { echo "Quantize circle model." 
echo "Usage: one-quantize" + echo " --version Show version information and exit" echo " --input_dtype Input data type (supported: float32, default=float32)" echo " --quantized_dtype Output quantized data type (supported: uint8, default=uint8)" - echo " --granularity Quantize granularity (supported: layer, default=layer)" + echo " --granularity Quantize granularity (supported: layer, channel, default=layer)" echo " --min_percentile Minimum percentile (0.0~100.0, default=1.0)" echo " --max_percentile Maximum percentile (0.0~100.0, default=99.0)" echo " --mode Record mode (supported: percentile/moving_average, default=percentile)" echo " --input_path " echo " --input_data " echo " --output_path " - exit 0 + exit 255 +} + +version() +{ + $DRIVER_PATH/one-version one-quantize + exit 255 } INPUT_DTYPE=float32 @@ -50,6 +57,9 @@ while [ "$#" -ne 0 ]; do '--help') usage ;; + '--version') + version + ;; '--input_dtype') INPUT_DTYPE="$2" @@ -100,13 +110,11 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then echo "Error: input model not found" echo "" usage - exit 2 fi if [ -z ${INPUT_DATA} ] || [ ! -e ${INPUT_DATA} ]; then echo "Error: input data not found" echo "" usage - exit 2 fi FILE_BASE=$(basename ${OUTPUT_PATH}) @@ -118,6 +126,13 @@ trap "{ rm -rf $TMPDIR; }" EXIT # remove previous log rm -rf "${OUTPUT_PATH}.log" +show_err_onexit() +{ + cat "${OUTPUT_PATH}.log" +} + +trap show_err_onexit ERR + # quantize circle echo "${DRIVER_PATH}/circle-quantizer" \ --quantize_dequantize_weights ${INPUT_DTYPE} ${QUANTIZED_DTYPE} ${GRANULARITY} \ diff --git a/compiler/one-cmds/requires.cmake b/compiler/one-cmds/requires.cmake index 9b858ad..812149c 100644 --- a/compiler/one-cmds/requires.cmake +++ b/compiler/one-cmds/requires.cmake @@ -3,3 +3,4 @@ require("tflite2circle") require("circle2circle") require("circle-quantizer") require("record-minmax") +require("vconone") diff --git a/compiler/record-minmax/CMakeLists.txt b/compiler/record-minmax/CMakeLists.txt index 862660e..f8a165b 100644 --- a/compiler/record-minmax/CMakeLists.txt +++ b/compiler/record-minmax/CMakeLists.txt @@ -19,9 +19,14 @@ target_link_libraries(record-minmax safemain) target_link_libraries(record-minmax luci_import) target_link_libraries(record-minmax luci_export) target_link_libraries(record-minmax luci_interpreter) +target_link_libraries(record-minmax vconone) install(TARGETS record-minmax DESTINATION bin) +if(NOT ENABLE_TEST) + return() +endif(NOT ENABLE_TEST) + nnas_find_package(GTest REQUIRED) GTest_AddTest(record_minmax_function_test "${CMAKE_CURRENT_SOURCE_DIR}/tests/RecordFunction.test.cpp") target_include_directories(record_minmax_function_test PRIVATE include) diff --git a/compiler/record-minmax/driver/Driver.cpp b/compiler/record-minmax/driver/Driver.cpp index ae4fcb7..8b09498 100644 --- a/compiler/record-minmax/driver/Driver.cpp +++ b/compiler/record-minmax/driver/Driver.cpp @@ -17,6 +17,13 @@ #include "RecordMinMax.h" #include +#include + +void print_version(void) +{ + std::cout << "record-minmax version " << vconone::get_string() << std::endl; + std::cout << vconone::get_copyright() << std::endl; +} int entry(const int argc, char **argv) { @@ -25,6 +32,13 @@ int entry(const int argc, char **argv) arser::Arser arser( "Embedding min/max values of activations to the circle model for post-training quantization"); + arser.add_argument("--version") + .nargs(0) + .required(false) + .default_value(false) + .help("Show version information and exit") + .exit_with(print_version); + arser.add_argument("--input_model") .nargs(1) 
.type(arser::DataType::STR) @@ -66,7 +80,7 @@ int entry(const int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } auto input_model_path = arser.get("--input_model"); diff --git a/compiler/record-minmax/requires.cmake b/compiler/record-minmax/requires.cmake index 0545035..f6804ce 100644 --- a/compiler/record-minmax/requires.cmake +++ b/compiler/record-minmax/requires.cmake @@ -1,3 +1,4 @@ require("luci") require("safemain") require("arser") +require("vconone") diff --git a/compiler/record-minmax/src/HDF5Importer.cpp b/compiler/record-minmax/src/HDF5Importer.cpp index cf30cd8..a0e65ee 100644 --- a/compiler/record-minmax/src/HDF5Importer.cpp +++ b/compiler/record-minmax/src/HDF5Importer.cpp @@ -20,6 +20,7 @@ #include #include +#include using Shape = luci_interpreter::Shape; using DataType = luci_interpreter::DataType; diff --git a/compiler/record-minmax/src/MinMaxObserver.cpp b/compiler/record-minmax/src/MinMaxObserver.cpp index 45f0197..410ce3d 100644 --- a/compiler/record-minmax/src/MinMaxObserver.cpp +++ b/compiler/record-minmax/src/MinMaxObserver.cpp @@ -38,7 +38,8 @@ void MinMaxObserver::postTensorWrite(const luci::CircleNode *node, assert(node->opcode() != luci::CircleOpcode::UNPACK); assert(node->opcode() != luci::CircleOpcode::WHILE); - if (node->opcode() == luci::CircleOpcode::CONST) + if (node->opcode() == luci::CircleOpcode::CONST || + node->opcode() == luci::CircleOpcode::CIRCLECONST) { // node is not activation. Do nothing. return; diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp index d12a0d3..17c6aa6 100644 --- a/compiler/record-minmax/src/RecordMinMax.cpp +++ b/compiler/record-minmax/src/RecordMinMax.cpp @@ -158,7 +158,7 @@ void RecordMinMax::profileData(const std::string &mode, const std::string &input auto node = iter->first; auto minmax = iter->second; - float min, max; + float min{0.0f}, max{0.0f}; if (mode == "percentile") { min = getNthPercentile(minmax.min_vector, min_percentile); diff --git a/compiler/record-minmax/tests/RecordFunction.test.cpp b/compiler/record-minmax/tests/RecordFunction.test.cpp index 13b464d..e2f135a 100644 --- a/compiler/record-minmax/tests/RecordFunction.test.cpp +++ b/compiler/record-minmax/tests/RecordFunction.test.cpp @@ -32,6 +32,8 @@ TEST(GetNthPercentileTest, Edge) EXPECT_FLOAT_NEAR(0, getNthPercentile(input, 0)); EXPECT_FLOAT_NEAR(9, getNthPercentile(input, 100)); + + SUCCEED(); } TEST(GetNthPercentileTest, Simple) @@ -47,6 +49,8 @@ TEST(GetNthPercentileTest, Simple) { EXPECT_FLOAT_NEAR(0.09 * std::floor(i) + 0.045, getNthPercentile(input, i)); } + + SUCCEED(); } TEST(GetNthPercentileTest, Float) @@ -61,6 +65,8 @@ TEST(GetNthPercentileTest, Float) EXPECT_FLOAT_NEAR(2.799942346802177, getNthPercentile(input, 1)); EXPECT_FLOAT_NEAR(7.768503955476342, getNthPercentile(input, 3.14)); EXPECT_FLOAT_NEAR(99.40456084968194, getNthPercentile(input, 99)); + + SUCCEED(); } TEST(GetNthPercentileTest, FloatWithNegative) @@ -75,6 +81,8 @@ TEST(GetNthPercentileTest, FloatWithNegative) EXPECT_FLOAT_NEAR(-47.20005765319782, getNthPercentile(input, 1)); EXPECT_FLOAT_NEAR(-42.23149604452366, getNthPercentile(input, 3.14)); EXPECT_FLOAT_NEAR(49.40456084968194, getNthPercentile(input, 99)); + + SUCCEED(); } TEST(GetNthPercentileTest, SigleElement) @@ -84,6 +92,8 @@ TEST(GetNthPercentileTest, SigleElement) EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 0)); EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 50)); EXPECT_FLOAT_NEAR(33, 
getNthPercentile(input, 100)); + + SUCCEED(); } TEST(GetNthPercentileTest, OutOfBoundary_NEG) @@ -92,6 +102,8 @@ TEST(GetNthPercentileTest, OutOfBoundary_NEG) EXPECT_THROW(getNthPercentile(input, -1), std::runtime_error); EXPECT_THROW(getNthPercentile(input, 101), std::runtime_error); + + SUCCEED(); } TEST(GetNthPercentileTest, EmptyVector_NEG) @@ -99,6 +111,8 @@ TEST(GetNthPercentileTest, EmptyVector_NEG) std::vector input; EXPECT_THROW(getNthPercentile(input, 10), std::runtime_error); + + SUCCEED(); } } // namespace record_minmax diff --git a/compiler/tfl-verify/CMakeLists.txt b/compiler/tfl-verify/CMakeLists.txt index d33059f..4421a46 100644 --- a/compiler/tfl-verify/CMakeLists.txt +++ b/compiler/tfl-verify/CMakeLists.txt @@ -6,6 +6,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp") add_executable(tfl-verify ${SOURCES}) target_include_directories(tfl-verify PRIVATE src) +target_link_libraries(tfl-verify arser) target_link_libraries(tfl-verify foder) target_link_libraries(tfl-verify mio_tflite) target_link_libraries(tfl-verify safemain) diff --git a/compiler/tfl-verify/requires.cmake b/compiler/tfl-verify/requires.cmake index ed6b84d..79503f3 100644 --- a/compiler/tfl-verify/requires.cmake +++ b/compiler/tfl-verify/requires.cmake @@ -1,3 +1,4 @@ +require("arser") require("foder") require("mio-tflite") require("safemain") diff --git a/compiler/tfl-verify/src/Driver.cpp b/compiler/tfl-verify/src/Driver.cpp index 81f6d54..6d18976 100644 --- a/compiler/tfl-verify/src/Driver.cpp +++ b/compiler/tfl-verify/src/Driver.cpp @@ -16,22 +16,31 @@ #include "VerifyFlatBuffers.h" +#include + #include #include #include int entry(int argc, char **argv) { - if (argc != 2) + arser::Arser arser; + arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file path to verify"); + + try { - std::cerr << "ERROR: Failed to parse arguments" << std::endl; - std::cerr << std::endl; - std::cerr << "USAGE: " << argv[0] << " [tflite]" << std::endl; + arser.parse(argc, argv); + } + catch (const std::runtime_error &err) + { + std::cout << err.what() << std::endl; + std::cout << arser; return 255; } + auto verifier = std::make_unique(); - std::string model_file = argv[argc - 1]; + std::string model_file = arser.get("tflite"); std::cout << "[ RUN ] Check " << model_file << std::endl; diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp index 932a649..692ce48 100644 --- a/compiler/tflchef/core/src/ModelChef.cpp +++ b/compiler/tflchef/core/src/ModelChef.cpp @@ -413,6 +413,7 @@ template void cook_graph(const T &graph, CookParams &cp) quant_builder.add_min(quant_min); quant_builder.add_scale(quant_scale); quant_builder.add_zero_point(quant_zero_point); + quant_builder.add_quantized_dimension(quant.quantized_dimension()); // Update QuantizationParameters Index quant_index = quant_builder.Finish(); diff --git a/compiler/tflchef/proto/tflchef.proto b/compiler/tflchef/proto/tflchef.proto index 792503b..55785c3 100644 --- a/compiler/tflchef/proto/tflchef.proto +++ b/compiler/tflchef/proto/tflchef.proto @@ -35,6 +35,7 @@ message TensorQuantization { repeated float max = 2; repeated float scale = 3; repeated int64 zero_point = 4; + optional int32 quantized_dimension = 5 [default = 0]; } message Operand { diff --git a/compiler/tflchef/tflite/src/RecipeChef.cpp b/compiler/tflchef/tflite/src/RecipeChef.cpp index db62d0e..088961c 100644 --- a/compiler/tflchef/tflite/src/RecipeChef.cpp +++ b/compiler/tflchef/tflite/src/RecipeChef.cpp @@ -184,6 +184,8 @@ std::unique_ptr generate_recipe(const 
tflite::Model *model) for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx) chef_quant->add_zero_point(quant->zero_point()->Get(idx)); } + tflchef::TensorQuantization *chef_quant = operand->mutable_quant(); + chef_quant->set_quantized_dimension(quant->quantized_dimension()); } } diff --git a/compiler/tflchef/tools/file/Driver.cpp b/compiler/tflchef/tools/file/Driver.cpp index cecfeeb..46e5b55 100644 --- a/compiler/tflchef/tools/file/Driver.cpp +++ b/compiler/tflchef/tools/file/Driver.cpp @@ -41,7 +41,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } int32_t model_version = 1; diff --git a/compiler/tflchef/tools/reverse/Driver.cpp b/compiler/tflchef/tools/reverse/Driver.cpp index 1116dec..4d795a3 100644 --- a/compiler/tflchef/tools/reverse/Driver.cpp +++ b/compiler/tflchef/tools/reverse/Driver.cpp @@ -38,7 +38,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } std::string tflite_path = arser.get("tflite"); diff --git a/compiler/tfldump/driver/Driver.cpp b/compiler/tfldump/driver/Driver.cpp index 3961d2f..38c9c06 100644 --- a/compiler/tfldump/driver/Driver.cpp +++ b/compiler/tfldump/driver/Driver.cpp @@ -33,7 +33,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << '\n'; std::cout << arser; - return 0; + return 255; } std::string tflite_path = arser.get("tflite"); diff --git a/compiler/tflite2circle/CMakeLists.txt b/compiler/tflite2circle/CMakeLists.txt index a0a2e02..b1d1f61 100644 --- a/compiler/tflite2circle/CMakeLists.txt +++ b/compiler/tflite2circle/CMakeLists.txt @@ -14,5 +14,6 @@ target_link_libraries(tflite2circle arser) target_link_libraries(tflite2circle safemain) target_link_libraries(tflite2circle mio_tflite) target_link_libraries(tflite2circle mio_circle) +target_link_libraries(tflite2circle vconone) install(TARGETS tflite2circle DESTINATION bin) diff --git a/compiler/tflite2circle/driver/Driver.cpp b/compiler/tflite2circle/driver/Driver.cpp index 67b8e33..2f11e0a 100644 --- a/compiler/tflite2circle/driver/Driver.cpp +++ b/compiler/tflite2circle/driver/Driver.cpp @@ -24,10 +24,25 @@ #include "CircleModel.h" #include "TFLModel.h" +#include + +void print_version(void) +{ + std::cout << "tflite2circle version " << vconone::get_string() << std::endl; + std::cout << vconone::get_copyright() << std::endl; +} + int entry(int argc, char **argv) { arser::Arser arser{"tflite2circle is a Tensorflow lite to circle model converter"}; + arser.add_argument("--version") + .nargs(0) + .required(false) + .default_value(false) + .help("Show version information and exit") + .exit_with(print_version); + arser.add_argument("tflite") .nargs(1) .type(arser::DataType::STR) @@ -42,7 +57,7 @@ int entry(int argc, char **argv) { std::cout << err.what() << std::endl; std::cout << arser; - return 0; + return 255; } std::string tfl_path = arser.get("tflite"); diff --git a/compiler/tflite2circle/requires.cmake b/compiler/tflite2circle/requires.cmake index ff19b74..837c287 100644 --- a/compiler/tflite2circle/requires.cmake +++ b/compiler/tflite2circle/requires.cmake @@ -2,3 +2,4 @@ require("arser") require("mio-tflite") require("mio-circle") require("safemain") +require("vconone") diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt new file mode 100644 index 0000000..b8cb793 --- /dev/null +++ b/compiler/vconone/CMakeLists.txt @@ -0,0 +1,31 @@ +if (NOT VCONONE_VERSION) + set(VCONONE_VERSION 0x0000000000080001) + # 
NOTE order is [build patch minor major] + # if VCONONE_VERSION is set with the -D option, it will be cached + # you may have to remove the cache file if you remove the -D option +endif() + +configure_file(version_cfg.h.in version_cfg.h @ONLY) + +set(DRIVER "driver/driver.cpp") + +file(GLOB_RECURSE SOURCES "src/*.cpp") +file(GLOB_RECURSE TESTS "src/*.test.cpp") +list(REMOVE_ITEM SOURCES ${TESTS}) + +add_library(vconone STATIC ${SOURCES}) +target_include_directories(vconone PUBLIC include) +target_include_directories(vconone PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) + +add_executable(one-version ${DRIVER}) +target_link_libraries(one-version vconone) +install(TARGETS one-version DESTINATION bin) + +if(NOT ENABLE_TEST) + return() +endif(NOT ENABLE_TEST) + +nnas_find_package(GTest REQUIRED) + +GTest_AddTest(vconone_test ${TESTS}) +target_link_libraries(vconone_test vconone) diff --git a/compiler/vconone/README.md b/compiler/vconone/README.md new file mode 100644 index 0000000..c08dd63 --- /dev/null +++ b/compiler/vconone/README.md @@ -0,0 +1,14 @@ +# vconone + +_vconone_ provides the version number and version strings for the one-* commands and command +line tools. + +# Revise version number + +To revise the version number, update `VCONONE_VERSION` in `CMakeLists.txt` +or pass `-DVCONONE_VERSION=0x0000000100080001` at the cmake configure step. + +The value packs four 16-bit integers, `build`, `patch`, `minor` and `major`, in +that order. `build` is not used for now. + +Version `0x0000000100080001` is interpreted as `1.8.1`. diff --git a/compiler/vconone/driver/driver.cpp b/compiler/vconone/driver/driver.cpp new file mode 100644 index 0000000..12bd0ee --- /dev/null +++ b/compiler/vconone/driver/driver.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +int main(int argc, char *argv[]) +{ + auto str = vconone::get_string(); + if (argc >= 2) + { + for (int c = 1; c < argc; ++c) + std::cout << argv[c] << " "; + std::cout << "version " << str << std::endl; + std::cout << vconone::get_copyright() << std::endl; + } + else + std::cout << str; + + return 0; +} diff --git a/compiler/vconone/include/vconone/vconone.h b/compiler/vconone/include/vconone/vconone.h new file mode 100644 index 0000000..a6a1998 --- /dev/null +++ b/compiler/vconone/include/vconone/vconone.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
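The `quantized_dimension` field added to the tflchef quantization recipe above, together with the new `channel` granularity option in one-quantize, enables per-channel quantization: every slice along the quantized dimension carries its own scale and zero point instead of one pair for the whole tensor. A minimal standalone sketch of that semantics with made-up values, assuming the usual affine dequantization rule real = scale[c] * (q - zero_point[c]); this is illustrative and not code from this patch:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main()
    {
      // Hypothetical uint8 weights: 2 channels along quantized_dimension = 0,
      // 3 values per channel; scale/zero_point hold one entry per channel.
      std::vector<std::vector<uint8_t>> q = {{10, 12, 14}, {100, 102, 104}};
      std::vector<float> scale = {0.5f, 0.25f};
      std::vector<int64_t> zero_point = {12, 102};

      for (std::size_t c = 0; c < q.size(); ++c)
        for (uint8_t v : q[c])
          // affine dequantization with that channel's parameters
          std::cout << scale[c] * (static_cast<int64_t>(v) - zero_point[c]) << " ";
      std::cout << std::endl; // prints: -1 0 1 -0.5 0 0.5

      return 0;
    }

With the default layer granularity the scale and zero_point lists hold a single entry and quantized_dimension keeps its default value of 0.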
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __VCON_ONE_H__ +#define __VCON_ONE_H__ + +#include +#include + +namespace vconone +{ + +struct four +{ + uint16_t major; + uint16_t minor; + uint16_t patch; + uint16_t build; // build is not used for now +}; + +union version { + uint64_t v; + four f; +}; + +/** + * @brief get_number will return version union structure + */ +version get_number(void); + +/** + * @brief get_string will return string of major.minor.patch (without build) + */ +std::string get_string(void); + +/** + * @brief get_string4 will return string of major.minor.patch.build + */ +std::string get_string4(void); + +/** + * @brief get_copyright will return copyright string + */ +std::string get_copyright(void); + +} // namespace vconone + +#endif // __VCON_ONE_H__ diff --git a/compiler/vconone/src/version.cpp b/compiler/vconone/src/version.cpp new file mode 100644 index 0000000..9b693c6 --- /dev/null +++ b/compiler/vconone/src/version.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vconone/vconone.h" + +#include "version_cfg.h" + +#include + +namespace vconone +{ + +version get_number(void) +{ + version v; + v.v = VCONONE_VERSION; + return v; +} + +std::string get_string4(void) +{ + std::ostringstream ss; + + auto v = get_number(); + ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch) << "." + << unsigned(v.f.build); + + return ss.str(); +} + +std::string get_string(void) +{ + std::ostringstream ss; + + auto v = get_number(); + ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch); + + return ss.str(); +} + +std::string get_copyright(void) +{ + std::string str; + str = "Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved\r\n"; + str += "Licensed under the Apache License, Version 2.0\r\n"; + str += "https://github.com/Samsung/ONE"; + return str; +} + +} // namespace vconone diff --git a/compiler/vconone/src/version.test.cpp b/compiler/vconone/src/version.test.cpp new file mode 100644 index 0000000..35a0647 --- /dev/null +++ b/compiler/vconone/src/version.test.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
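The version encoding described in the vconone README and decoded by the `version` union in version.cpp above can be read straight off the hex literal: from most to least significant, the four 16-bit groups are build, patch, minor and major. A standalone sketch of the same decode using shifts (equivalent to the union layout on the little-endian targets it assumes), worked through on the example value from the README:

    #include <cstdint>
    #include <iostream>

    int main()
    {
      const uint64_t packed = 0x0000000100080001ULL; // example from the README
      const auto major = static_cast<uint16_t>(packed & 0xFFFF);
      const auto minor = static_cast<uint16_t>((packed >> 16) & 0xFFFF);
      const auto patch = static_cast<uint16_t>((packed >> 32) & 0xFFFF);
      const auto build = static_cast<uint16_t>((packed >> 48) & 0xFFFF);

      (void)build; // dropped by get_string(); get_string4() would append ".0"
      std::cout << major << "." << minor << "." << patch << std::endl; // 1.8.1
      return 0;
    }

The same reading explains the default `VCONONE_VERSION` of 0x0000000000080001 in the CMakeLists above: it decodes to 1.8.0.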
+ */ + +#include + +#include + +TEST(vconone, version_number) +{ + auto v = vconone::get_number(); + + ASSERT_NE(0x0000000000000000ULL, v.v); +} + +TEST(vconone, version_string) +{ + auto str = vconone::get_string(); + + ASSERT_NE("..", str); + ASSERT_NE("", str); +} + +TEST(vconone, version_string4) +{ + auto str = vconone::get_string4(); + + ASSERT_NE("...", str); + ASSERT_NE("", str); +} + +TEST(vconone, copyright) +{ + auto str = vconone::get_copyright(); + + ASSERT_NE("", str); +} diff --git a/compiler/vconone/version_cfg.h.in b/compiler/vconone/version_cfg.h.in new file mode 100644 index 0000000..aa3ad9e --- /dev/null +++ b/compiler/vconone/version_cfg.h.in @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __VCON_ONE_VERSION_CFG_H__ +#define __VCON_ONE_VERSION_CFG_H__ + +#define VCONONE_VERSION @VCONONE_VERSION@ULL + +#endif // __VCON_ONE_VERSION_CFG_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h deleted file mode 100644 index 9699b5c..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
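The drivers touched earlier in this patch (record-minmax, tflite2circle, tfl-verify and the tflchef/tfldump tools) converge on the same arser wiring: an optional zero-argument --version flag that prints the vconone version via exit_with, and a parse failure that prints usage and returns 255 instead of 0. A consolidated sketch of that pattern for reference; the include paths, the get<std::string>() template argument and the "some-tool"/"model" names are assumptions (the angle-bracket contents are stripped in the diff text above), so treat this as illustrative rather than a drop-in driver:

    #include <arser/arser.h>     // assumed header path for arser
    #include <vconone/vconone.h> // assumed; matches vconone's public include directory

    #include <iostream>
    #include <stdexcept>
    #include <string>

    void print_version(void)
    {
      std::cout << "some-tool version " << vconone::get_string() << std::endl;
      std::cout << vconone::get_copyright() << std::endl;
    }

    int entry(int argc, char **argv)
    {
      arser::Arser arser{"some-tool processes a model file"};

      arser.add_argument("--version")
          .nargs(0)
          .required(false)
          .default_value(false)
          .help("Show version information and exit")
          .exit_with(print_version);

      arser.add_argument("model").type(arser::DataType::STR).help("Model file path");

      try
      {
        arser.parse(argc, argv);
      }
      catch (const std::runtime_error &err)
      {
        std::cout << err.what() << std::endl;
        std::cout << arser;
        return 255; // parse failures no longer return 0
      }

      std::string model_path = arser.get<std::string>("model"); // template argument assumed
      std::cout << "[ RUN ] Check " << model_path << std::endl;
      return 0;
    }

Defining entry() rather than main() mirrors these tools' convention of linking the safemain wrapper instead of providing main() directly, as the link lines in the CMake changes above suggest.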
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file CLArgOperationKernel.h - * @brief This file defines CLArgOperationKernel - * @ingroup COM_AI_RUNTIME - */ - -#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ -#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to define interface for the argop kernel. - */ -class CLArgOperationKernel : public ICLKernel -{ -public: - /** - * @brief Default constructor. - */ - CLArgOperationKernel(); - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied - */ - CLArgOperationKernel(const CLArgOperationKernel &) = delete; - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers). - * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied - * @return Reference of this instance - */ - CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved - */ - CLArgOperationKernel(CLArgOperationKernel &&) = default; - /** - * @brief Allow instances of this class to be moved - * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved - * @return Reference of this instance - */ - CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default; - /** - * @brief Initialise the kernel's input, output and border mode. - * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. - * @param[out] output The output tensor, Data types supported: S32. - * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. - * @param[in] op Arg operation to perform. - * return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op); - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLArgOperationKernel - * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. - * @param[in] output The output tensor info, Data types supported: S32. - * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. - * @param[in] op Arg operation to perform. 
- * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, - ArgOperation op); - - /* - * @brief Run CLArgOperationKernel op - * @param[in] window Window to be used for in_slice - * @param[in] queue cl::CommandQueue - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - uint32_t _axis; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h deleted file mode 100644 index b0357fe..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -/** - * @file CLCastKernel.h - * @ingroup COM_AI_RUNTIME - * @brief This file defines CLCastKernel class - */ - -#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ -#define __ARM_COMPUTE_CLCASTKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to define OpenCL kernel for cast operation - */ -class CLCastKernel : public ICLKernel -{ -public: - /** - * @brief Construct CLCastKernel object - */ - CLCastKernel(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLCastKernel(const CLCastKernel &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLCastKernel &operator=(const CLCastKernel &) = delete; - - /** - * @brief Construct CLCastKernel object using default move constructor - * @param[in] CLCastKernel object to move - */ - CLCastKernel(CLCastKernel &&) = default; - - /** - * @brief Allow instances of this class to be moved - * @param[in] CLCastKernel object to move - */ - CLCastKernel &operator=(CLCastKernel &&) = default; - - /** - * @brief Destruct this CLCastKernel object - */ - ~CLCastKernel() = default; - - /** - * @brief Initialise the kernel's input and output. - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] input_subtype Sub data type of input. - * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype); - - /** - * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command - * queue. - * @note The queue is *not* flushed by this method, and therefore the kernel will not have - * been executed by the time this method returns. - * @param[in] window Region on which to execute the kernel. (Must be a valid region of - * the window returned by window()). - * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A - * @return N/A - */ - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h deleted file mode 100644 index 8615cf1..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ -#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform depthTospace operation */ -class CLDepthToSpaceKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLDepthToSpaceKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete; - /** Allow instances of this class to be moved */ - CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default; - /** Allow instances of this class to be moved */ - CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default; - /** Default destructor */ - ~CLDepthToSpaceKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - */ - void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h deleted file mode 100644 index 9321c36..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__ -#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to multiply matrices - * - * @note This kernel should be used ONLY for Midgard architectures - * - * This kernel performs the following computation: - * - * -# Convert a values from int8 to int32 - * -# Convert b values from int8 to int32 - * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 - * - */ -class CLGEMMLowpMatrixMultiplyKernelEx : public ICLKernel -{ -public: - /** Default Constructor */ - CLGEMMLowpMatrixMultiplyKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyKernelEx(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyKernelEx &operator=(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyKernelEx(CLGEMMLowpMatrixMultiplyKernelEx &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpMatrixMultiplyKernelEx &operator=(CLGEMMLowpMatrixMultiplyKernelEx &&) = default; - /** Initialise the kernel's input and output. - * - * @note This kernel should be used ONLY for Midgard architectures - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p - * input0 - * @param[out] output Output tensor to store the result of matrix multiplication. 
Data type - * supported: S32 - * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of - * the input matrices - */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, - const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLGEMMLowpMatrixMultiplyKernelEx - * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 - * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p - * input0 - * @param[in] output Output tensor to store the result of matrix multiplication. Data type - * supported: S32 - * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of - * the input matrices - * - * @return a status - */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, - const ITensorInfo *output, - const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input0; - const ICLTensor *_input1; - ICLTensor *_output; - bool _slide_matrix_b; - bool _reinterpret_input_as_3d; - bool _reinterpret_output_as_3d; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__*/ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h deleted file mode 100644 index dd2dbf6..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__ -#define __ARM_COMPUTE_CLPRELU_KERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to calculate PReLU*/ -class CLPReLUKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLPReLUKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLPReLUKernel(const CLPReLUKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - CLPReLUKernel &operator=(const CLPReLUKernel &) = delete; - /** Allow instances of this class to be moved */ - CLPReLUKernel(CLPReLUKernel &&) = default; - /** Allow instances of this class to be moved */ - CLPReLUKernel &operator=(CLPReLUKernel &&) = default; - /** Initialize the kernel's input, output. - * - * @param[in] input Source tensor1. - * @param[in] alpha Source tensor2. - * @param[out] output Output tensor. - */ - void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - - BorderSize border_size() const override; - -private: - const ICLTensor *_input; - const ICLTensor *_alpha; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h deleted file mode 100644 index 4c0a82c..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ -#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform spaceTodepth operation */ -class CLSpaceToDepthKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLSpaceToDepthKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete; - /** Allow instances of this class to be moved */ - CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default; - /** Allow instances of this class to be moved */ - CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default; - /** Default destructor */ - ~CLSpaceToDepthKernel() = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - */ - void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor */ - ICLTensor *_output; /**< Destination tensor */ -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h deleted file mode 100644 index 9d174de..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ -#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL. - */ -class CLTransposeConvLayerUpsampleKernel : public ICLKernel -{ -public: - /** Constructor */ - CLTransposeConvLayerUpsampleKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayerUpsampleKernel & - operator=(const CLTransposeConvLayerUpsampleKernel &) = delete; - /** Default Move Constructor. */ - CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default; - /** Default move assignment operator */ - CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default; - /** Default destructor */ - ~CLTransposeConvLayerUpsampleKernel() = default; - - /** Initialise the kernel's input and output. - * - * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. - * @param[out] output Destination tensor. Data types supported: same as @p input. All but - * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only - * performed within the XY-plane. - * @param[in] inner_border Top and right inner border sizes. These rows and columns will be - * filled with zero. - * @param[in] info Contains padding and stride information described in @ref - * PadStrideInfo. - */ - void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, - const PadStrideInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLTransposeConvLayerUpsample - * - * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. - * @param[in] output Destination tensor info. Data types supported: same as @p input. All - * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is - * only performed within the XY-plane. - * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled - * with zero. - * @param[in] info Contains padding and stride information described in @ref - * PadStrideInfo. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const BorderSize &inner_border, const PadStrideInfo &info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - BorderSize _inner_border; - PadStrideInfo _info; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h deleted file mode 100644 index d4c9c61..0000000 --- a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ -#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ - -#include "arm_compute/core/CPP/ICPPKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** CPP kernel to perform tensor upsample. 
- * - */ -class CPPUpsampleKernelEx : public ICPPKernel -{ -public: - const char *name() const override { return "CPPUpsampleKernelEx"; } - /** Default constructor */ - CPPUpsampleKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete; - /** Allow instances of this class to be moved */ - CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default; - /** Allow instances of this class to be moved */ - CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default; - /** Default destructor */ - ~CPPUpsampleKernelEx() = default; - - /** Set the input and output of the kernel. - * - * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 - * @param[out] output The output tensor. Data types supported: Same as @p input - * @param[in] info Padding info. - */ - void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - bool is_parallelisable() const override; - -private: - const ITensor *_input; - ITensor *_output; - PadStrideInfo _info; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h deleted file mode 100644 index 4e9f097..0000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NECASTKERNEL_H__ -#define __ARM_COMPUTE_NECASTKERNEL_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the cast layer kernel. */ -class NECastKernel : public INEKernel -{ -public: - const char *name() const override { return "NECastKernel"; } - /** Default constructor */ - NECastKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NECastKernel(const NECastKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NECastKernel &operator=(const NECastKernel &) = delete; - /** Default Move Constructor. */ - NECastKernel(NECastKernel &&) = default; - /** Default move assignment operator */ - NECastKernel &operator=(NECastKernel &&) = default; - /** Default destructor */ - ~NECastKernel() = default; - /** Set input, output tensors. - * - * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[out] output Destination tensor with the same dimensions of input. Data type supported: - * U8/S8/QASYMM8/U32/S32/F32. - * @param[in] input_subtype Sub data type of input. - */ - void configure(const ITensor *input, ITensor *output, SubDataType input_subtype); - /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel - * - * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[in] input_subtype Sub data type of input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - SubDataType input_subtype); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - const ITensor *_input; - ITensor *_output; - SubDataType _input_subtype; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h deleted file mode 100644 index b62897e..0000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ -#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the depth to space kernel */ -class NEDepthToSpaceLayerKernelEx : public INEKernel -{ -public: - const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; } - /** Default constructor */ - NEDepthToSpaceLayerKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete; - /** Allow instances of this class to be moved */ - NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default; - /** Allow instances of this class to be moved */ - NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default; - /** Default destructor */ - ~NEDepthToSpaceLayerKernelEx() = default; - /** Initialise the kernel's inputs and output. - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[out] output Tensor output. Data types supported: same as @p input - * @param[in] block_shape Block shape x value. - */ - void configure(const ITensor *input, ITensor *output, int32_t block_shape); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEDepthToSpaceLayerKernelEx. - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] output Tensor output info. Data types supported: same as @p input - * @param[in] block_shape Block shape value. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - const ITensor *_input; /**< Source tensor */ - ITensor *_output; /**< Destination tensor */ - int32_t _block_shape; /**< Block shape */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h deleted file mode 100644 index 57de78d..0000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ -#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for an element-wise unary operation kernel - * - * Element-wise operation is computed by: - * @f[ output(x) = OP(input(x))@f] - * - */ -class NEElementwiseUnaryKernelEx : public INEKernel -{ -public: - const char *name() const override { return "NEElementwiseUnaryKernelEx"; } - /** Default constructor */ - NEElementwiseUnaryKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete; - /** Allow instances of this class to be moved */ - NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default; - /** Allow instances of this class to be moved */ - NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default; - /** Default destructor */ - ~NEElementwiseUnaryKernelEx() = default; - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEElementwiseUnaryKernelEx - * - * @param[in] op Arithmetic operation to be executed. - * @param[in] input First tensor input. Data types supported: F16/F32/S32. - * @param[in] output Output tensor. Data types supported: Same as @p input. - */ - void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output); - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEElementwiseUnaryKernelEx - * - * @param[in] op Arithmetic operation to be executed. - * @param[in] input First tensor input info. Data types supported: F16/F32/S32. - * @param[in] output Output tensor info. Data types supported: Same as @p input. - * - * @return a Status - */ - static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input, - const ITensorInfo *output); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - - /** Common signature for all the specialised arithmetic functions - * - * @param[in] input An input tensor. Data types supported: F16/F32/S32. - * @param[out] output The output tensor. Data types supported: Same as @p input. - * @param[in] window Region on which to execute the kernel. - */ - using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output, - const Window &window); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output); - - /** Function to use for the particular tensor types passed to configure() */ - std::function _function; - - const ITensor *_input; - ITensor *_output; -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h deleted file mode 100644 index 722efd3..0000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__ -#define __ARM_COMPUTE_NEPRELUKERNEL_H__ - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the kernel to perform Parametric Rectified Linear Unit - * - * Result is computed by: - * @f[ output(x) = alpha * x for x < 0, output(x) = x for x >= 0 @f] - */ -class NEPReLUKernel : public INEKernel -{ -public: - const char *name() const override { return "NEPReLUKernel"; } - /** Default constructor */ - NEPReLUKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEPReLUKernel(const NEPReLUKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEPReLUKernel &operator=(const NEPReLUKernel &) = delete; - /** Allow instances of this class to be moved */ - NEPReLUKernel(NEPReLUKernel &&) = default; - /** Allow instances of this class to be moved */ - NEPReLUKernel &operator=(NEPReLUKernel &&) = default; - /** Initialise the kernel's inputs and output - * - * @param[in] input Input tensor. Data type supported: QASYMM8/F32 - * @param[in] alpha Alpha tensor. Data types supported: Same as @p input - * @param[out] output Output tensor. Data types supported: Same as @p input - */ - void configure(const ITensor *input, const ITensor *alpha, ITensor *output); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEPReLUKernel.h - * - * @param[in] input Input tensor input info. Data types supported: QASYMM8/F32. - * @param[in] alpha Alpha tensor input info. Data types supported: Same as @p input. 
- * @param[in] output Output tensor info. Data types supported: Same as @p input. - * - * @return a Status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, - const ITensorInfo *output); - static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, - const ITensorInfo &output); - -private: - const ITensor *_input; /**< Source tensor */ - const ITensor *_alpha; /**< Alpha tensor */ - ITensor *_output; /**< Destination tensor */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h deleted file mode 100644 index 0ffcf6b..0000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ -#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ITensor; - -/** Interface for the space to depth kernel */ -class NESpaceToDepthLayerKernelEx : public INEKernel -{ -public: - const char *name() const override { return "NESpaceToDepthLayerKernelEx"; } - /** Default constructor */ - NESpaceToDepthLayerKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete; - /** Allow instances of this class to be moved */ - NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default; - /** Allow instances of this class to be moved */ - NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default; - /** Default destructor */ - ~NESpaceToDepthLayerKernelEx() = default; - /** Initialise the kernel's inputs and output. - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[out] output Tensor output. Data types supported: same as @p input - * @param[in] block_shape Block shape value - */ - void configure(const ITensor *input, ITensor *output, int32_t block_shape); - /** Static function to check if given info will lead to a valid configuration of @ref - * NESpaceToDepthLayerKernelEx - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] output Tensor output info. Data types supported: same as @p input - * @param[in] block_shape Block shape value - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - const ITensor *_input; /**< Source tensor */ - ITensor *_output; /**< Destination tensor */ - int32_t _block_shape; /**< Block shape */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h index 97bc4ce..cfbd134 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h @@ -16,25 +16,14 @@ #ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__ #define __ARM_COMPUTE_CLFUNCTIONSEX_H__ -#include -#include #include -#include -#include #include #include #include #include #include -#include #include -#include -#include #include -#include -#include -#include -#include #include #include diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h deleted file mode 100644 index c37096f..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file CLArgOperation.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLArgOperation class - */ - -#ifndef __ARM_COMPUTE_CLARGOPERATION_H__ -#define __ARM_COMPUTE_CLARGOPERATION_H__ - -#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to execute CLArgOperation operation - */ -class CLArgOperation : public IFunction -{ -public: - /** - * @brief Construct a new CLArgOperation object - */ - CLArgOperation(); - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLArgOperation(const CLArgOperation &) = delete; - - /** - * @brief Prevent instances of this class from being copied (As this class contains pointers) - */ - CLArgOperation &operator=(const CLArgOperation &) = delete; - - /** - * @brief Construct a new CLArgOperation object by using copy constructor - * @param[in] CLArgOperation object to move - */ - CLArgOperation(CLArgOperation &&) = default; - - /** - * @brief Assign a CLArgOperation object. - * @param[in] CLArgOperation object to assign. This object will be moved. - */ - CLArgOperation &operator=(CLArgOperation &&) = default; - - /** - * @brief Initialise the kernel's inputs and outputs. - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. - * @param[out] output The result of arg operation. Data types supported: S32. - * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. - * @param[in] op Arg operation to perform. - * @return N/A - */ - void configure(ICLTensor *input, ICLTensor *output, std::vector axis, ArgOperation op); - - /** - * @brief Static function to check if given info will lead to a valid configuration - * @param[in] input Input tensor. 
Data types supported: U8/QASYMM8/S32/F32. - * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. - * @param[out] output The result of arg operation. Data types supported: S32. - * @param[in] op Arg operation to perform. - * @return a status - */ - static Status validate(const ITensorInfo *input, const std::vector &axis, - const ITensorInfo *output, ArgOperation op); - /** - * @brief Run the OpenCL kernel for this operation - * @return N/A - */ - void run() override; - -private: - ICLTensor *_input{nullptr}; - ICLTensor *_output{nullptr}; - std::vector _axis{}; - ArgOperation _arg_op{ArgOperation::MAX}; - - std::unique_ptr _interm_tensors{nullptr}; - std::unique_ptr _argop_kernels{nullptr}; - size_t _num_of_kernels{0}; -}; -} -#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h deleted file mode 100644 index eed5cb8..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ -#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLBatchToSpaceNDKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. - */ -class CLBatchToSpaceND : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. 
Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] block_size A pointer to an array of integer values specifying block sizes - * for spatial dimension. - */ - void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size); -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h deleted file mode 100644 index ebe0d8a..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file CLCast.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLCast class - */ - -#ifndef __ARM_COMPUTE_CLCAST_H__ -#define __ARM_COMPUTE_CLCAST_H__ - -#include "arm_compute/core/TypesEx.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to run @ref CLCastKernel. - * This converts the input tensor to the tensor of the output tensor's type. - */ -class CLCast : public ICLSimpleFunction -{ -public: - /** - * @brief Initialise the kernel's input and output - * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * The input tensor is [in, out] because its TensorInfo might be - * modified inside the kernel. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[in] input_subtype Sub data type of input. 
- */ - void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype); -}; -} -#endif /* __ARM_COMPUTE_CLCAST_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h deleted file mode 100644 index d52a538..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__ -#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLDepthToSpaceKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. - */ -class CLDepthToSpace : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
- * @param[block_size] block size integer only - */ - void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); -}; -} // namesace arm_compute - -#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h new file mode 100644 index 0000000..409eaf5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ +#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ + +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h" +#include "arm_compute/runtime/CL/functions/CLReverse.h" +#include "arm_compute/runtime/CL/functions/CLTranspose.h" + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" + +#include + +namespace arm_compute +{ +class ICLTensor; +/** Function to run the deconvolution layer. + * + * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input + * depending on the stride and pad info and then perform a 1x1 + * convolution pass. Input stride defines how many zeroes we should put between each element of the + * input and pad is the amount of padding. 
+ * + * The relation between input to output is as follows: + * \f[ + * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x + * \f] + * \f[ + * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y + * \f] + * + * where: + * width_input is the size of the first input dimension. + * height_input is the size of the second input dimension. + * width_output is the size of the first output dimension. + * height_output is the size of the second output dimension. + * kernel_x and kernel_y are the convolution sizes in x and y. + * stride_x and stride_y are the input strides of the first and second dimensions. + * + * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. + * Therefore, it will be necessary to use the weights in the + * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse. + * + * This function calls the following OpenCL kernels/functions: + * + * -# @ref CLDeconvolutionLayerUpsample + * -# @ref CLConvolutionLayer + * + * And the following CPP kernels: + * -# @ref CLReverse + * + */ +class CLDirectTransposeConvLayer : public IFunction +{ +public: + /** Constructor */ + CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete; + /** Default move constructor */ + CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete; + /** Default move assignment operator */ + CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default; + /** Set the input, weights, biases and output tensors. + * + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type + * supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, except for + * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[out] output Output tensor. The output has the same number of dimensions as the + * @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this + * is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * + */ + void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Set the input, weights, biases and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Input tensor.
3 lower dimensions represent a single input, and + * an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, except for + * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref + * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref + * CLWeightsReshapeKernel. + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLDirectTransposeConvLayer + * + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, except for input + * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[in] output Output tensor info. The output has the same number of dimensions as the + * @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this is + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + CLDeconvolutionLayerUpsample _scale_f; + CLConvolutionLayer _conv_f; + CLReverse _flip_weights; + + CLTensor _scaled_output; + ICLTensor *_original_weights; + CLTensor _weights_flipped; + CLTensor _flip_axis; + + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h index 1a0284a..f3266f6 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h @@ -50,7 +50,7 @@ #include "arm_compute/core/CL/kernels/CLTransposeKernel.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" namespace arm_compute { @@ -168,7 +168,7 @@ private: CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel; CLScaleFactorSymm8Kernel _scale_factor_kernel; CLQuantizationSymmetricKernel _quant_input_kernel; - CLGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; + CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; CLMultiplyScaleFactorKernel _multiply_scale_kernel; CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to // add bias in diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h deleted file mode 100644 index 68aba74..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ -#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ - -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/MemoryGroup.h" - -namespace arm_compute -{ -class IMemoryManager; -class ICLTensor; - -/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the - * following OpenCL kernels: - * - * -# @ref CLGEMMLowpMatrixMultiplyKernel (if the parameter "reshape_b_only_on_first_run" of - * GEMMInfo is FALSE) - * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0) - * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0) - * -*/ -class CLGEMMLowpMatrixMultiplyCoreEx : public IFunction -{ -public: - /** Constructor */ - CLGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr memory_manager = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyCoreEx(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete; - /** Default move constructor */ - CLGEMMLowpMatrixMultiplyCoreEx(CLGEMMLowpMatrixMultiplyCoreEx &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGEMMLowpMatrixMultiplyCoreEx &operator=(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete; - /** Default move assignment operator */ - CLGEMMLowpMatrixMultiplyCoreEx &operator=(CLGEMMLowpMatrixMultiplyCoreEx &&) = default; - /** Initialise the kernel's inputs, output - * - * @note GEMMLowp: low precision GEMM kernel. [A * B + C] - * This kernel performs the following computations: - * - * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. - * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them. - * -# Compute the matrix product of the resulting a * b in int32. - * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE - * - * @param[in] a First input tensor (Matrix A). 
Data type supported: QASYMM8. - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: - * S32 - * @param[out] output Output tensor. Data type supported: S32 or QASYMM8 if - * gemm_info.gemmlowp_output_stage != NONE - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped - * and - * if the reshape of matrix B should be executed only for the first run - */ - void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, - const GEMMInfo &gemm_info = GEMMInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLGEMMLowpMatrixMultiplyCoreEx - * - * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8. - * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type - * supported: S32 - * @param[in] output Output tensor info. Data type supported: S32 or QASYMM8 if - * gemm_info.gemmlowp_output_stage != NONE - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped - * and - * if the reshape of matrix B should be executed only for the first run - * - * @return a status - */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); - - // Inherited methods overridden: - void run() override; - void prepare() override; - -private: - MemoryGroup _memory_group; - - // Kernels used - CLGEMMLowpMatrixMultiplyKernelEx _mm_midgard_kernel; - CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; - CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; - - // Temporary tensors - CLTensor _vector_sum_col; - CLTensor _vector_sum_row; - - int32_t _a_offset; - int32_t _b_offset; - bool _reshape_b_only_on_first_run; - bool _is_prepared; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h deleted file mode 100644 index 5121671..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__ -#define __ARM_COMPUTE_CLLOGICALNOT_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -class CLLogicalNot : public ICLSimpleFunction -{ -public: - /** Initialise the function's source and destination. - * - * @param[in] input Source tensor. Data types supported: QASYMM8. - * @param[out] output Output tensor. Data types supported: QASYMM8. - */ - void configure(ICLTensor *input, ICLTensor *output); -}; - -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h deleted file mode 100644 index 7fbe558..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLPRELU_H__ -#define __ARM_COMPUTE_CLPRELU_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -class CLPReLU : public ICLSimpleFunction -{ -public: - /** Initialise the function's source and destination. - * - * @param[in] input. Data types supported: - * QASYMM8/F16/F32. - * @param[in] alpha. Data types supported: - * QASYMM8/F16/F32. - * @param[out] output Output tensor. Data types supported: Same as @p input. - */ - void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_CLPRELU_H__*/ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h deleted file mode 100644 index e83fb01..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -/** - * @file CLPixelWiseDivision.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLPixelWiseDivision class - */ -#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ -#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to run @ref CLPixelWiseDivisionKernel. - */ -class CLPixelWiseDivision : public ICLSimpleFunction -{ -public: - /** - * @brief Initialise the kernel's inputs, output and convertion policy. - * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32 - * The input tensor is [in, out] because its TensorInfo might be - * modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. - * The input tensor is [in, out] because its TensorInfo might be - * modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output The output tensor, Data types supported: same as @p input1. - * Note: U8 requires both inputs to be U8. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or - * 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest - * even. - * @return N/A - */ - void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, - ConvertPolicy overflow_policy = ConvertPolicy::WRAP, - RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); - - /** - * @brief Static function to check if given info will lead to a valid configuration of @ref - * CLPixelWiseDivision - * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32 - * @param[in] input2 An input tensor info. Data types supported: same as @p input1. - * @param[in] output The output tensor info, Data types supported: same as @p input1. - * Note: U8 requires both inputs to be U8. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n - * where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, float scale = 1.f, - ConvertPolicy overflow_policy = ConvertPolicy::WRAP, - RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); -}; -} -#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h deleted file mode 100644 index b49cbd8..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__ -#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__ - -#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" -#include "arm_compute/runtime/CL/functions/CLGEMM.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLRNNLayerEx */ -class CLRNNLayerEx : public IFunction -{ -public: - /** Default constructor */ - CLRNNLayerEx(std::shared_ptr memory_manager = nullptr); - /** Initialize the function - * - * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data - * types supported: F16/F32 - * @param[in] weights Weights tensor of shape [input_size, num_units] that - * multiplies the input. Data types supported: Same as @p input - * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies - * the current 'state'. Data types supported: Same as @p input - * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same - * as @p input - * @param[out] output Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] info Activation layer parameter. - */ - void configure(const ICLTensor *input, const ICLTensor *weights, - const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, - ICLTensor *output, ActivationLayerInfo &info); - /** Initialize the function - * - * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. 
Data - * types supported: F16/F32 - * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies - * the input. Data types supported: Same as @p input - * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the - * current 'state'. Data types supported: Same as @p input - * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p - * input - * @param[in] output Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] info Activation layer parameter. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *recurrent_weights, const ITensorInfo *bias, - const ITensorInfo *hidden_state, const ITensorInfo *output, - const ActivationLayerInfo &info); - - // Inherited methods overridden: - void run() override; - void prepare() override; - -private: - MemoryGroup _memory_group; - CLGEMM _gemm_state_f; - CLSaturatedArithmeticOperationKernel _add_kernel; - CLActivationLayerKernel _activation_kernel; - CLFullyConnectedLayer _fully_connected_kernel; - CLCopyKernel _copy_kernel; - CLTensor _fully_connected_out; - CLTensor _gemm_output; - CLTensor _add_output; - bool _is_prepared; -}; -} -#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h deleted file mode 100644 index 2090b46..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__ -#define __ARM_COMPUTE_CLSPACETODEPTH_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLSpaceToDepthKernel - * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. - * @note The function converts the input tensor to the tensor of the output tensor's type. - */ -class CLSpaceToDepth : public ICLSimpleFunction -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. - * @param[block_size] block size integer only - */ - void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); -}; - -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h deleted file mode 100644 index 03edd15..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -/** - * @file CLStridedSlice.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class - */ - -#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ -#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ - -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class ICLTensor; - -/** - * @brief Class to run @ref CLStridedSliceKernel - */ -class CLStridedSliceEx : public ICLSimpleFunction -{ -public: - /** - * @brief Initialise the kernel's inputs and outputs - * @param[in] input Tensor input. Data type supported: - * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 - * @param[out] output Output tensor. Data type supported: Same as @p input - * @param[in] beginData 'begin' vector of strided slice operation - * @param[in] endData 'end' vector of strided slice operation - * @param[in] stridesData 'strides' vector of strided slice operation - * @param[in] beginMask If the ith bit is set, begin[i] is ignored - * @param[in] endMask If the ith bit is set, end[i] is ignored - * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the - * dimensionality by 1, taking on the value at index begin[i] - * @return N/A - */ - void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, - ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, - int32_t shrinkAxisMask); -}; -} -#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h index 54a697e..5fb102e 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,16 +37,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - #ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ #define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ -#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" -#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" - -#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h" - -#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -54,119 +49,102 @@ namespace arm_compute { -class ICLTensor; -/** Function to run the transpose convolution layer. - * - * @note This layer was copied in order to fix a bug computing to wrong output dimensions. - * - * TransposeConv Layer is the backward pass of Convolution Layer. First we transform the input - * depending on the stride and pad info and then perform a 1x1 - * convolution pass. Input stride defines how many zeroes we should put between each element of the - * input, pad is the amount of padding and finally a is a user - * specified value where a < stride - 1, that increases the padding top and right of the input - * image. 
- * - * The relation between input to output is as follows: - * \f[ - * width\_output = (width\_input - 1) \cdot stride\_x - \cdot padding\_x + kernel\_x - * \f] - * \f[ - * height\_output = (height\_input - 1) \cdot stride\_y - \cdot padding\_y + kernel\_y - * \f] - * - * where: - * width_input is the size of the first input dimension. - * height_input is the size of the second input dimension. - * width_output is the size of the first output dimension. - * height_output is the size of the second output dimension. - * kernel_x and kernel_y are the convolution sizes in x and y. - * stride_x and stride_y is the input stride of the first and second dimension. - * - * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. - * Therefore, it will be necessary to use the weights in the - * reverse order to perform an actual convolution. This is achieved by using the @ref - * CPPFlipWeightsKernel. - * - * This function calls the following OpenCL kernels/functions: - * - * -# @ref CLTransposeConvLayerUpsample - * -# @ref CLConvolutionLayer +/** Basic function to compute the deconvolution layer. This function calls the following OpenCL + * kernels/functions: * + * -# @ref CLGEMMDeconvolutionLayer + * -# @ref CLDirectTransposeConvLayer */ class CLTransposeConvLayer : public IFunction { public: - /** Constructor */ + /** Default constructor */ CLTransposeConvLayer(std::shared_ptr memory_manager = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayer(const CLTransposeConvLayer &) = delete; - /** Default move constructor */ - CLTransposeConvLayer(CLTransposeConvLayer &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete; - /** Default move assignment operator */ - CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default; + /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, - * and an optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. - * Data type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: - * Same as @p input. - * @param[out] output Output tensor. The output has the same number of dimensions - * as the @p input. - * @param[in] info Contains padding and policies to be used in the - * transpose convolution, this is decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to top edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref - * CLConvolutionLayer, specifies if the weights tensor has been - * reshaped with @ref CLWeightsReshapeKernel. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type + * supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same + * as @p input. + * @param[out] output Output tensor. 
The output has the same number of dimensions as the + * @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this + * is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); + /** Set the input, weights, biases and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and + * an optional 4th dimension for batch of inputs. Data types supported: + * QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: + * Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref + * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref + * CLWeightsReshapeKernel. + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, + unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLTransposeConvLayer + * CLTransposeConvLayer + * + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as + * @p input. + * @param[in] output Output tensor info. The output has the same number of dimensions as the + * @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. * - * @param[in] input Input tensor info. 
3 lower dimensions represent a single input, - * and an optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8/F16/F32. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. - * Data type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: - * Same as @p input. - * @param[in] output Output tensor info. The output has the same number of dimensions - * as the @p input. - * @param[in] info Contains padding and policies to be used in the - * transpose convolution, this is decribed in @ref PadStrideInfo. - * @param[in] innvalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to top edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref - * CLWeightsReshapeKernel. * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, - unsigned int innvalid_right, unsigned int invalid_bottom, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); + static DeconvolutionMethod + get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info); // Inherited methods overridden: void run() override; void prepare() override; private: - MemoryGroup _memory_group; - CLTransposeConvLayerUpsample _scale_f; - CLConvolutionLayer _conv_f; - CPPFlipWeightsKernel _flip_weights; - CLTensor _scaled_output; - ICLTensor *_original_weights; - CLTensor _weights_flipped; - bool _is_prepared; + std::shared_ptr _memory_manager; + std::unique_ptr _function; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h deleted file mode 100644 index 7570fe7..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ -#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */ -class CLTransposeConvLayerUpsample : public IFunction -{ -public: - /** Default constructor */ - CLTransposeConvLayerUpsample(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete; - /** Allow instances of this class to be moved */ - CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default; - /** Allow instances of this class to be moved */ - CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default; - /** Default destructor */ - virtual ~CLTransposeConvLayerUpsample() = default; - - /** Initialize the function's source, destination, interpolation type and border_mode. - * - * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. - * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] inner_border The number of zeros added to right and top edges of the input. - * @param[in] info Contains padding and policies to be used in the deconvolution. - */ - void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, - const PadStrideInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLTransposeConvLayerUpsample - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. - * @param[in] output Destination tensor info. Data type supported: same as @p input. - * @param[in] inner_border The number of zeros added to right and top edges of the input. - * @param[in] info Contains padding and policies to be used in the deconvolution. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const BorderSize &inner_border, const PadStrideInfo &info); - - // Inherited methods overridden: - void run() override; - -private: - CLTransposeConvLayerUpsampleKernel _upsample; - ICLTensor *_output; -}; -} -#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h deleted file mode 100644 index 666afef..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ -#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ - -#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" - -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to run @ref CPPUpsample */ -class CPPUpsampleEx : public ICPPSimpleFunction -{ -public: - /** Configure the upsample CPP kernel - * - * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 - * @param[out] output The output tensor. 
Data types supported: Same as @p input - * @param[in] info Padding information - */ - void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); -}; -} -#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h index 49504fd..3fad230 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -18,20 +18,13 @@ #include #include -#include -#include #include #include #include #include #include -#include -#include #include -#include #include -#include -#include #include #endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h deleted file mode 100644 index f0f0d81..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NECAST_H__ -#define __ARM_COMPUTE_NECAST_H__ - -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */ -class NECast : public INESimpleFunctionNoBorder -{ -public: - /** Configure the kernel. - * - * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[out] output Destination tensor with the same dimensions of input. 
Data type supported: - * U8/S8/QASYMM8/U32/S32/F32. - * @param[in] input_subtype Sub data type of input. - */ - void configure(const ITensor *input, ITensor *output, - SubDataType input_subtype = SubDataType::NONE); - /** Static function to check if given info will lead to a valid configuration of @ref NECast - * - * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32. - * @param[in] input_subtype Sub data type of input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - SubDataType input_subtype = SubDataType::NONE); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NECAST_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h deleted file mode 100644 index 005d85a..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ -#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. */ -class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder -{ -public: - /** Set the input and output tensors. - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. 
- * @param[out] output Tensor output. Data types supported: same as @p input - * @param[in] block_shape Block shape value. - */ - void configure(const ITensor *input, ITensor *output, int32_t block_shape); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEDepthToSpaceLayerEx. - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] output Tensor output info. Data types supported: same as @p input - * @param[in] block_shape Block shape x value. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h deleted file mode 100644 index 27a38e9..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ -#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ - -#include "arm_compute/runtime/NEON/INESimpleFunction.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to perform negative on an input tensor. */ -class NENegLayer : public INESimpleFunction -{ -public: - /** Initialize the function - * - * @param[in] input Input tensor. Data types supported: F16/F32/S32. - * @param[out] output Output tensor. Data types supported: same as @p input. 
- */ - void configure(const ITensor *input, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref NERsqrtLayer - * - * @param[in] input First tensor input info. Data types supported: F16/F32/S32. - * @param[in] output Output tensor info. Data types supported: Same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h index 39c57eb..56548a4 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -46,7 +46,7 @@ #include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" #include "arm_compute/core/NEON/kernels/NETransposeKernel.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" #include "arm_compute/runtime/Tensor.h" @@ -164,7 +164,7 @@ private: MemoryGroup _memory_group; NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; NEQuantizationSymmetricKernel _quant_input_kernel; - NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; + NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; NEMultiplyScaleFactorKernel _multiply_scale_kernel; NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; Tensor _reshape_weights_output; diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h deleted file mode 100644 index d844513..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ -#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" -#include "arm_compute/runtime/Tensor.h" - -#include - -namespace arm_compute -{ -class ITensor; - -/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following - * NEON kernels if the DOT product instruction is not available: - * - * -# @ref NEGEMMInterleave4x4Kernel - * -# @ref NEGEMMTranspose1xWKernel - * -# @ref NEGEMMLowpMatrixMultiplyKernel - * -# @ref NEGEMMLowpOffsetContributionKernel - * -# @ref NEActivationLayer - * - * otherwise if the DOT product instruction is available: - * - * -# @ref NEGEMMLowpOffsetContributionKernel - * -*/ -class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction -{ -public: - /** Constructor */ - NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr memory_manager = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; - /** Default move constructor */ - NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; - /** Default move assignment operator */ - NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; - /** Initialise the kernel's inputs, output - * - * @note GEMM_LOWP: low precision GEMM kernel - * This kernel performs the following computations: - * - * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. - * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. - * -# Compute the matrix product of the resulting a * b in int32. - * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is - * QASYMM8/QASYMM8_SIGNED otherwise - * - * @param[in] a First input tensor (Matrix A). Data type supported: - * QASYMM8/QASYMM8_SIGNED. - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: - * S32 - * @param[out] output Output tensor. 
Data type supported: Data type supported: - * S32/QASYMM8/QASYMM8_SIGNED - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped - * and - * if the reshape of matrix B should be executed only for the first run - */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, - const GEMMInfo &gemm_info = GEMMInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEGEMMLowpMatrixMultiplyCoreEx - * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is - * QASYMM8/QASYMM8_SIGNED otherwise - * - * @param[in] a First input tensor info (Matrix A). Data type supported: - * QASYMM8/QASYMM8_SIGNED. - * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type - * supported: S32 - * @param[in] output Output tensor info. Data type supported: Data type supported: - * S32/QASYMM8/QASYMM8_SIGNED - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped - * and - * if the reshape of matrix B should be executed only for the first run - * - * @return a status - */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); - - // Inherited methods overridden - void run() override; - void prepare() override; - -private: - MemoryGroup _memory_group; - NEGEMMAssemblyDispatch _asm_glue; - std::unique_ptr _mm_kernel; - std::unique_ptr _mtx_a_reshape_kernel; - std::unique_ptr _mtx_b_reshape_kernel; - NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; - NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; - NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; - NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel; - - Tensor _vector_sum_col; - Tensor _vector_sum_row; - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _mm_result_s32; - Tensor _signed_a; - Tensor _signed_output; - const ITensor *_original_b; - int32_t _a_offset; - int32_t _b_offset; - - bool _run_vector_matrix_multiplication; - bool _assembly_path; - bool _fused_assembly_path; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - bool _fuse_output_stage; - bool _flip_signedness; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h deleted file mode 100644 index ca84133..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEPRELU_H__ -#define __ARM_COMPUTE_NEPRELU_H__ - -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to run @ref NEPReLUKernel */ -class NEPReLU : public INESimpleFunctionNoBorder -{ -public: - /** Initialise the kernel's inputs and output - * - * @param[in] input. Data types supported: QASYMM8/F32. - * @param[in] alpha. Data types supported: Same as @p input. - * @param[out] output Output tensor. Data types supported: Same as @p input. - */ - void configure(const ITensor *input, const ITensor *alpha, ITensor *output); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEPRELU_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h deleted file mode 100644 index 8a7b179..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__ -#define __ARM_COMPUTE_NERNNLAYER_EX_H__ - -#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" -#include "arm_compute/core/NEON/kernels/NECopyKernel.h" - -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMM.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -/** Basic function to run @ref NERNNLayerEx */ -class NERNNLayerEx : public IFunction -{ -public: - /** Default constructor */ - NERNNLayerEx(std::shared_ptr memory_manager = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NERNNLayerEx(const NERNNLayerEx &) = delete; - /** Default move constructor */ - NERNNLayerEx(NERNNLayerEx &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NERNNLayerEx &operator=(const NERNNLayerEx &) = delete; - /** Default move assignment operator */ - NERNNLayerEx &operator=(NERNNLayerEx &&) = default; - /** Initialize the function - * - * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data - * types supported: F16/F32 - * @param[in] weights Weights tensor of shape [input_size, num_units] that - * multiplies the input. Data types supported: Same as @p input - * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies - * the current 'state'. Data types supported: Same as @p input - * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same - * as @p input - * @param[out] output Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] info Activation layer parameter. - */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, - const ITensor *bias, ITensor *hidden_state, ITensor *output, - ActivationLayerInfo &info); - /** Initialize the function - * - * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data - * types supported: F16/F32 - * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies - * the input. Data types supported: Same as @p input - * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the - * current 'state'. Data types supported: Same as @p input - * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p - * input - * @param[in] output Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types - * supported: Same as @p input - * @param[in] info Activation layer parameter. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *recurrent_weights, const ITensorInfo *bias, - const ITensorInfo *hidden_state, const ITensorInfo *output, - const ActivationLayerInfo &info); - - // Inherited methods overridden: - void run() override; - void prepare() override; - -private: - MemoryGroup _memory_group; - NEGEMM _gemm_state_f; - NEArithmeticAdditionKernel _add_kernel; - NEActivationLayerKernel _activation_kernel; - NEFullyConnectedLayer _fully_connected_kernel; - NECopyKernel _copy_kernel; - Tensor _fully_connected_out; - Tensor _gemm_output; - Tensor _add_output; - bool _is_prepared; -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h deleted file mode 100644 index 03ac457..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ -#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" -#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to perform reduce operation */ -class NEReduceMeanEx : public IFunction -{ -public: - /** Constructor */ - NEReduceMeanEx(std::shared_ptr memory_manager = nullptr); - /** Configure kernel - * - * @note Supported tensor rank: up to 4 - * - * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 - * @param[in] reduction_axis Reduction axis vector. - * @param[in] keep_dims If positive, retains reduced dimensions with length 1. - * @param[out] output Destination tensor. Data type supported: Same as @p input - */ - void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, - ITensor *output); - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEReduceMeanEx - * - * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 - * @param[in] reduction_axis Reduction axis vector. - * @param[in] keep_dims If positive, retains reduced dimensions with length 1. - * @param[in] output Destination tensor. Data type supported: Same as @p input - * - * @return A status - */ - static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, - bool keep_dims, const ITensorInfo *output); - - // Inherited methods overridden: - void run() override; - -private: - MemoryGroup _memory_group; - std::unique_ptr _reduction_kernels{nullptr}; - std::unique_ptr _reduced_outs{nullptr}; - NEReshapeLayer _reshape; - unsigned int _reduction_ops; - bool _keep_dims; -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h deleted file mode 100644 index 3b695fb..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ -#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" -#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to spatial divide a tensor. This function calls the following NEON - * kernels/functions: - * - * -# @ref NEMemsetKernel - * -# @ref NESpaceToBatchLayerKernel - */ -class NESpaceToBatchLayerEx : public IFunction -{ -public: - /** Default constructor */ - NESpaceToBatchLayerEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete; - /** Allow instances of this class to be moved */ - NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default; - /** Allow instances of this class to be moved */ - NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default; - /** Default destructor */ - virtual ~NESpaceToBatchLayerEx() = default; - /** Set the input and output tensors. - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 - * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 - * @param[out] output Tensor output. Data types supported: same as @p input - */ - void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, - ITensor *output); - /** Set the input and output tensors. (Static block shape and paddings) - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] block_shape_x Block shape x value. - * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. - * @param[out] output Tensor output. 
Data types supported: same as @p input - */ - void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, - const Size2D &padding_left, const Size2D &padding_right, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * NESpaceToBatchLayerEx - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32 - * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32 - * @param[in] output Tensor output info. Data types supported: same as @p input - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, - const ITensorInfo *paddings, const ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * NESpaceToBatchLayerEx (Static block shape and paddings) - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] block_shape_x Block shape x value. - * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. - * @param[in] output Tensor output info. Data types supported: same as @p input - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, - const Size2D &padding_left, const Size2D &padding_right, - const ITensorInfo *output); - - // Inherited methods overridden: - void run() override; - -private: - NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ - NEMemsetKernel _memset_kernel; /**< Memset kernel to run */ - bool _has_padding; /**< Flag to check if the output has padding */ -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h deleted file mode 100644 index 9f32616..0000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ -#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ - -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -namespace arm_compute -{ -class ITensor; - -/** This function calls the following NEON kernels/functions: - * - * -# @ref NESpaceToDepthLayerKernelEx - */ -class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder -{ -public: - /** Set the input and output tensors. - * - * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[out] output Tensor output. Data types supported: same as @p input - * @param[in] block_shape Block shape value - */ - void configure(const ITensor *input, ITensor *output, int32_t block_shape); - /** Static function to check if given info will lead to a valid configuration of @ref - * NESpaceToDepthLayerEx (Static block shape and paddings) - * - * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: - * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. - * @param[in] output Tensor output info. Data types supported: same as @p input - * @param[in] block_shape Block shape value - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h index 408d150..24ff5da 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,16 +37,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ - #ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ #define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ -#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" +#include "arm_compute/runtime/CPP/functions/CPPUpsample.h" #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/NEON/functions/NEReverse.h" -#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -59,8 +57,8 @@ namespace arm_compute { /** Function to run the deconvolution layer. * - * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the - * input depending on the stride and pad info and then perfrom a 1x1 + * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input + * depending on the stride and pad info and then perform a 1x1 * convolution pass. Input stride defines how many zeroes we should put between each element of the * input, pad is the amount of padding and finaly a is a user * specified value where a < stride - 1 that increases the padding top and right of the input image. @@ -81,21 +79,22 @@ namespace arm_compute * kernel_x and kernel_y are the convolution sizes in x and y. * stride_x and stride_y is the input stride of the first and second dimension. * - * The weights used by Transpose convolution are supposed to be the same as the ones used for - * Convolution. Therefore, it will be necessary to use the weights in the - * reverse order to perform an actual convolution. This is achieved by using the @ref - * CPPFlipWeightsKernel. + * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. + * Therefore, it will be necessary to use the weights in the + * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse. * * This function calls the following NEON kernels/functions: * - * -# @ref CPPUpsample + * -# @ref CPPUpsampleEx * -# @ref NEConvolutionLayer + * -# @ref NEPermute + * -# @ref NEReverse * */ class NETransposeConvLayer : public IFunction { public: - /** Default constructor */ + /** Constructor */ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -112,37 +111,38 @@ public: /** Set the input, weights, biases and output tensors. * * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8. + * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. + * supported: Same as @p input. * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type - * supported: Data types supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input. + * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 + * for F16 input. * @param[out] output Output tensor. The output has the same number of dimensions as the @p - * input. + * input.
* @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to top edge of the output. + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * */ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom); /** Static function to check if given info will lead to a valid configuration of @ref - * NETransposeConvLayer + * NETransposeConvLayer * * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8. + * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. + * supported: Same as @p input. * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types - * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input. + * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. * @param[in] output Output tensor info. The output has the same number of dimensions as the @p - * input. + * input. * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] innvalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to top edge of the output. + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
* * @return a status */ @@ -158,17 +158,11 @@ public: private: MemoryGroup _memory_group; NEConvolutionLayer _conv_f; - CPPUpsampleEx _upsample_f; - CPPFlipWeightsKernel _flip_weights; - NEPermute _permute_input; - NEPermute _permute_weights; - NEPermute _permute_output; + CPPUpsample _upsample_f; + NEReverse _flip_weights; Tensor _scaled_output; Tensor _weights_flipped; - Tensor _permuted_input; - Tensor _permuted_weights; - Tensor _permuted_output; - bool _is_nchw; + Tensor _flip_axis; const ITensor *_original_weights; ITensor *_input; PadStrideInfo _info; diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 7b6b974..ba42a24 100644 --- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -55,16 +55,7 @@ using namespace arm_compute; const std::map CLKernelLibraryEx::_kernel_program_map = { // ARMComputeEx kernels - {"arg_op", "arg_operation.cl"}, - {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, {"binary_logical_op", "binary_logical_op.cl"}, - {"cast", "cast.cl"}, - {"cast_qasymm_in", "cast.cl"}, - {"cast_qasymm_out", "cast.cl"}, - {"comparison_op", "comparison_op.cl"}, - {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, - {"depth_to_space_nchw", "depth_to_space.cl"}, - {"depth_to_space_nhwc", "depth_to_space.cl"}, {"embedding_lookup", "embedding_lookup.cl"}, {"gather_ex", "gather_ex.cl"}, {"gather_ex_1d", "gather_ex.cl"}, @@ -74,10 +65,6 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"instance_normalization_ex", "instance_normalization_ex.cl"}, {"multiply_scale_factor", "multiply_scale_factor.cl"}, {"neg_tensor", "neg_tensor.cl"}, - {"permute_generic", "permute_ex.cl"}, - {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, - {"prelu", "prelu.cl"}, - {"prelu_qasymm8", "prelu_quantized.cl"}, {"quantization_symm8", "quantization_symm8.cl"}, {"reduce_min_max", "reduce_operation.cl"}, {"reduce_sum_mean", "reduce_operation.cl"}, @@ -91,29 +78,15 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"radixsort_reorder", "topkv2_radixsort.cl"}, {"topkv2_quicksort", "topkv2_quicksort.cl"}, {"scale_factor_symm8", "scale_factor.cl"}, - {"space_to_depth_nchw", "space_to_depth.cl"}, - {"space_to_depth_nhwc", "space_to_depth.cl"}, }; const std::map CLKernelLibraryEx::_program_source_map = { #ifdef EMBEDDED_KERNELS { - "arg_operation.cl", -#include "./cl_kernels/arg_operation.clembed" - }, - { - "cast.cl", -#include "./cl_kernels/cast.clembed" - }, - { "embedding_lookup.cl", #include "./cl_kernels/embedding_lookup.clembed" }, { - "depth_to_space.cl", -#include "./cl_kernels/depth_to_space.clembed" - }, - { "gather_ex.cl", #include "./cl_kernels/gather_ex.clembed" }, @@ -150,14 +123,6 @@ const std::map CLKernelLibraryEx::_program_source_map #include "./cl_kernels/neg_tensor.clembed" }, { - "prelu.cl", -#include "./cl_kernels/prelu.clembed" - }, - { - "prelu_quantized.cl", -#include "./cl_kernels/prelu_quantized.clembed" - }, - { "quantization_symm8.cl", #include "./cl_kernels/quantization_symm8.clembed" }, @@ -170,10 +135,6 @@ const std::map CLKernelLibraryEx::_program_source_map #include "./cl_kernels/scale_factor.clembed" }, { - "space_to_depth.cl", -#include "./cl_kernels/space_to_depth.clembed" - }, - { "topkv2.cl", #include "./cl_kernels/topkv2.clembed" }, diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl deleted file mode 100644 index 
03717cf..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) -/** Perform arg_max/arg_min - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. - * e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. - * e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as - * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: - * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension - * (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension - * (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element - * in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension - * (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. 
- * Supported data types: U32 - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension - * (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - * @param[in] axis Axis through which reduction occurs - * @param[in] dim Dimension across the axis to be reduced. - */ - -__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis, - const int dim) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - - int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; - - DATA_TYPE value = - *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - DATA_TYPE tval = value; - int idx = 0; - for (int i = 1; i < dim; ++i) - { - indices[axis] = i; - -#if OP_CODE == 1 // ArgMax - value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], - indices[2], indices[3]))); -#elif OP_CODE == 2 // ArgMin - value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], - indices[2], indices[3]))); -#else - return; - -#endif - - if (tval != value) - { - idx = indices[axis]; - tval = value; - } - } - - *((__global uint *)out.ptr) = idx; -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl deleted file mode 100644 index f74c1c1..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers_asymm.h" - -#ifdef SATURATE -#define ADD(x, y) add_sat((x), (y)) -#define SUB(x, y) sub_sat((x), (y)) -#else /* SATURATE */ -#define ADD(x, y) (x) + (y) -#define SUB(x, y) (x) - (y) -#endif /* SATURATE */ - -/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to - * QASYMM8 - * - * The following computations will be performed: - * - * -# Add offset terms to inputs - -# Get scaled value of two inputs - * -# Add inputs - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Shift the int32 accumulator by result_shift - * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. - * - * @attention The inputs and output data types need to be passed at compile time using - * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The number of bits to shift left of input tensors must be passed at compile time using - * -DLEFT_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of input tensors - * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, - -DIN2_OFFSET, - * -RIN2_MULT_INT and -DIN2_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor - * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and - -DRESULT_SHIFT - * - * @attention The input and output data_types need to be passed at compile time using - * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The inputs and output scale information of qasymm8 need to be passed at compile time - * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: - * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f - * @attention The inputs and output scale offset need to be passed at compile time using - * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: - * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise - * wrapping policy will be used. 
- * - * @param[in] in1_ptr Pointer to the source tensor. - * Supported data types: QASYMM8 - * @param[in] in1_stride_x Stride of the source tensor in X dimension - * (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed - * per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension - * (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed - * per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed - * per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source - * tensor - * @param[in] in2_ptr Pointer to the source tensor. Supported data types: - * QASYMM8 - * @param[in] in2_stride_x Stride of the source tensor in X dimension - * (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed - * per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension - * (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed - * per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed - * per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source - * tensor - * @param[out] out_ptr Pointer to the destination tensor. - * Supported data types: QASYMM8 - * @param[in] out_stride_x Stride of the destination tensor in X dimension - * (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed - * per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension - * (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed - * per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension - * (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed - * per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination - * tensor - */ -__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(int, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); - VEC_DATA_TYPE(int, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); - - // Get scaled value of two inputs - VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); - VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); - - VEC_DATA_TYPE(int, 16) - left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); - VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; - VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; - - VEC_DATA_TYPE(int, 16) - scaled_in1_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); - VEC_DATA_TYPE(int, 16) - scaled_in2_val = - 
ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); - - // Add inputs and multiply with a multiplier smaller than 1 - VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; - VEC_DATA_TYPE(int, 16) - out_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); - out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); - - VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); - - // TODO: Apply min-max BOUND to support fuse with relu. - /* - #if defined(MIN_BOUND) - res = max(res, (uchar16)MIN_BOUND); - #endif // defined(MIN_BOUND) - #if defined(MAX_BOUND) - res = min(res, (uchar16)MAX_BOUND); - #endif // defined(MAX_BOUND) - */ - - // Store result - VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl deleted file mode 100644 index 4147a00..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#ifndef SCALE -#define SCALE 1.0f -#endif -#ifndef OFFSET -#define OFFSET 0 -#endif -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) -/** Perform a cast operation on an input tensor. - * - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and - * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
- * -DVEC_SIZE=16 - * @attention -DBOOL_INPUT : Whether type of input is bool. - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); -#if defined(BOOL_INPUT) - VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE)); - VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1); - res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); -#endif // defined(BOOL_INPUT) - - VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr); -} - -/** Perform a cast operation on an QASYMM8 input tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and - * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of input should be given as a preprocessor argument using - * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. 
Supported data - * types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) - in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - - VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; - - VSTORE(VEC_SIZE) - (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, - (__global DATA_TYPE_OUT *)output.ptr); -} - -/** Perform a cast operation on an QASYMM8 output tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and - * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of output should be given as a preprocessor argument using - * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. 
Supported data - * types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. Supported data - * types: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) - in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - - VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); - - VSTORE(VEC_SIZE) - (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, - (__global DATA_TYPE_OUT *)output.ptr); -} -#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl deleted file mode 100644 index 0285c95..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. - * e.g. -DDEPTH_OUT=16 - * @attention The value of the z-axis of output tensor should be given as a preprocessor argument - * using -DZ_OUT=size. e.g. -DZ_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. - * -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); - - int out_index[4] = {0}; - int in_index[4] = {0}; - - out_index[0] = get_global_id(0); // W - out_index[1] = get_global_id(1); // H - out_index[2] = get_global_id(2) % Z_OUT; // C - out_index[3] = get_global_id(2) / Z_OUT; // B - - in_index[0] = out_index[0] / BLOCK_SIZE; - in_index[1] = out_index[1] / BLOCK_SIZE; - in_index[2] = out_index[2] + - ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; - in_index[3] = out_index[3]; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( - &in, in_index[0], in_index[1], in_index[2], in_index[3])); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) - -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) -/** Perform space to depth rearrangement of tensor (NHWC) - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. - * e.g. -DDEPTH_OUT=16 - * @attention The value of the z-axis of output tensor should be given as a preprocessor argument - * using -DZ_OUT=size. e.g. -DZ_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. - * -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); - - int out_index[4] = {0}; - int in_index[4] = {0}; - - out_index[0] = get_global_id(0); // C - out_index[1] = get_global_id(1); // W - out_index[2] = get_global_id(2) % Z_OUT; // H - out_index[3] = get_global_id(2) / Z_OUT; // B - - in_index[0] = out_index[0] + - ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; - in_index[1] = out_index[1] / BLOCK_SIZE; - in_index[2] = out_index[2] / BLOCK_SIZE; - in_index[3] = out_index[3]; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( - &in, in_index[0], in_index[1], in_index[2], in_index[3])); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index 2d0b6a2..e07a25e 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,7 +37,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - #ifndef ARM_COMPUTE_HELPER_H #define ARM_COMPUTE_HELPER_H @@ -59,16 +58,219 @@ #pragma OPENCL EXTENSION cl_arm_printf : enable #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) +#define GPU_ARCH_MIDGARD 0x100 +#define GPU_ARCH_BIFROST 0x200 + +/** Concatenate two inputs. + * + * @param[in] a The first input to be concatenated + * @param[in] b The second input to be concatenated + * + * @return The concatenated output + */ +#define CONCAT(a, b) a##b + +/** Expand the given vector + * + * @param[in] x The vector to be expanded + * + * @return The expanded output + */ #define EXPAND(x) x +/** Clamp the given value between an upper and lower bound. + * + * @param[in] x The value to be clamped + * @param[in] min_val The lower bound + * @param[in] max_val The upper bound + * + * @return The clamped value. + */ #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) +/** REVn reverses the given vector whose size is n. 
+ * @name REVn + * + * @param[in] x The vector to be reversed + * + * @return The reversed vector + * @{ + */ +#define REV1(x) ((x)) +#define REV2(x) ((x).s10) +#define REV3(x) ((x).s210) +#define REV4(x) ((x).s3210) +#define REV8(x) ((x).s76543210) +#define REV16(x) ((x).sFEDCBA9876543210) +/** @} */ // end of group REVn + +/** Reverse the given vector. + * @name REVERSE + * + * @param[in] x The vector to be reversed + * @param[in] s The size of the vector + * + * @return The reversed vector + * @{ + */ +#define REVERSE_STR(x, s) REV##s((x)) +#define REVERSE(x, s) REVERSE_STR(x, s) +/** @} */ // end of group REVERSE + +/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. + * @name ROTs_n + * + * @param[in] x The vector to be shifted + * + * @return The shifted vector + * @{ + */ +#define ROT1_0(x) ((x)) + +#define ROT2_0(x) ((x)) +#define ROT2_1(x) ((x).s10) + +#define ROT3_0(x) ((x)) +#define ROT3_1(x) ((x).s201) +#define ROT3_2(x) ((x).s120) + +#define ROT4_0(x) ((x)) +#define ROT4_1(x) ((x).s3012) +#define ROT4_2(x) ((x).s2301) +#define ROT4_3(x) ((x).s1230) + +#define ROT8_0(x) ((x)) +#define ROT8_1(x) ((x).s70123456) +#define ROT8_2(x) ((x).s67012345) +#define ROT8_3(x) ((x).s56701234) +#define ROT8_4(x) ((x).s45670123) +#define ROT8_5(x) ((x).s34567012) +#define ROT8_6(x) ((x).s23456701) +#define ROT8_7(x) ((x).s12345670) + +#define ROT16_0(x) ((x)) +#define ROT16_1(x) ((x).sF0123456789ABCDE) +#define ROT16_2(x) ((x).sEF0123456789ABCD) +#define ROT16_3(x) ((x).sDEF0123456789ABC) +#define ROT16_4(x) ((x).sCDEF0123456789AB) +#define ROT16_5(x) ((x).sBCDEF0123456789A) +#define ROT16_6(x) ((x).sABCDEF0123456789) +#define ROT16_7(x) ((x).s9ABCDEF012345678) +#define ROT16_8(x) ((x).s89ABCDEF01234567) +#define ROT16_9(x) ((x).s789ABCDEF0123456) +#define ROT16_10(x) ((x).s6789ABCDEF012345) +#define ROT16_11(x) ((x).s56789ABCDEF01234) +#define ROT16_12(x) ((x).s456789ABCDEF0123) +#define ROT16_13(x) ((x).s3456789ABCDEF012) +#define ROT16_14(x) ((x).s23456789ABCDEF01) +#define ROT16_15(x) ((x).s123456789ABCDEF0) +/** @} */ // end of group ROTs_n + +/** Circular-right-shift (rotate-right) the given vector by the given amount. + * @name ROTATE + * + * @param[in] x The vector to be shifted + * @param[in] s The size of the vector + * @param[in] n The amount to be shifted + * + * @return The shifted vector + * @{ + */ +#define ROTATE_STR(x, s, n) ROT##s##_##n(x) +#define ROTATE(x, s, n) ROTATE_STR(x, s, n) +/** @} */ // end of group ROTATE + +/** Creates a vector of size n filled with offset values corresponding to the location of each + * element. + * @name V_OFFSn + * + * @param[in] dt The data type of the output vector + * + * @return The vector filled with offset values + * @{ + */ +#define V_OFFS1(dt) (dt)(0) +#define V_OFFS2(dt) (dt)(0, 1) +#define V_OFFS3(dt) (dt)(0, 1, 3) +#define V_OFFS4(dt) (dt)(0, 1, 2, 3) +#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7) +#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +/** @} */ // end of group V_OFFSn + +/** Create a vector filled with offset values corresponding to the location of each element. 
+ * @name VEC_OFFS + * + * @param[in] dt The data type of the output vector + * @param[in] s The size of the output vector + * + * @return The vector filled with offset values + * @{ + */ +#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) +#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) +/** @} */ // end of group VEC_OFFS + #define VLOAD_STR(size) vload##size #define VLOAD(size) VLOAD_STR(size) #define VSTORE_STR(size) vstore##size #define VSTORE(size) VSTORE_STR(size) +#define float1 float +#define half1 half +#define char1 char +#define uchar1 uchar +#define short1 short +#define ushort1 ushort +#define int1 int +#define uint1 uint +#define long1 long +#define ulong1 ulong +#define double1 double + +#define vload1(OFFSET, PTR) *(OFFSET + PTR) +#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA + +// Convert built-in functions with _sat modifier are not supported in floating point so we create +// defines +// without _sat to overcome this issue +#define convert_float_sat convert_float +#define convert_float1_sat convert_float +#define convert_float2_sat convert_float2 +#define convert_float3_sat convert_float3 +#define convert_float4_sat convert_float4 +#define convert_float8_sat convert_float8 +#define convert_float16_sat convert_float16 +#define convert_half_sat convert_float +#define convert_half1_sat convert_half +#define convert_half2_sat convert_half2 +#define convert_half3_sat convert_half3 +#define convert_half4_sat convert_half4 +#define convert_half8_sat convert_half8 +#define convert_half16_sat convert_half16 + +#define convert_float1 convert_float +#define convert_half1 convert_half +#define convert_char1 convert_char +#define convert_uchar1 convert_uchar +#define convert_short1 convert_short +#define convert_ushort1 convert_ushort +#define convert_int1 convert_int +#define convert_uint1 convert_uint +#define convert_long1 convert_long +#define convert_ulong1 convert_ulong +#define convert_double1 convert_double + +#define convert_char1_sat convert_char_sat +#define convert_uchar1_sat convert_uchar_sat +#define convert_short1_sat convert_short_sat +#define convert_ushort1_sat convert_ushort_sat +#define convert_int1_sat convert_int_sat +#define convert_uint1_sat convert_uint_sat +#define convert_long1_sat convert_long_sat +#define convert_ulong1_sat convert_ulong_sat +#define convert_double1_sat convert_double_sat + #define VEC_DATA_TYPE_STR(type, size) type##size #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h index a83b1a8..5f1b3f9 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,29 +37,112 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
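
To make the intent of the helper macros added to helpers.h above concrete, here is a minimal usage sketch in OpenCL C. It is illustrative only and not part of the patch: the kernel name helpers_demo and its buffer arguments are hypothetical, and the example values in the comments assume v = (1, 2, 3, 4).

    #include "helpers.h"

    // Hypothetical demo kernel exercising the newly added vector helpers.
    __kernel void helpers_demo(__global int *in, __global int *out)
    {
        VEC_DATA_TYPE(int, 4) v = vload4(0, in);     // e.g. v = (1, 2, 3, 4)
        VEC_DATA_TYPE(int, 4) r = REVERSE(v, 4);     // expands to (v).s3210 -> (4, 3, 2, 1)
        VEC_DATA_TYPE(int, 4) s = ROTATE(v, 4, 1);   // rotate right by one -> (4, 1, 2, 3)
        VEC_DATA_TYPE(int, 4) o = VEC_OFFS(int4, 4); // offset vector (0, 1, 2, 3)
        vstore4(r + s + o, 0, out);
    }

The size-1 aliases added in the same hunk (int1, vload1, vstore1, and the convert_*1 / *_sat variants) let code that is generic over VEC_SIZE also compile when VEC_SIZE is 1.
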
*/ - #ifndef ARM_COMPUTE_HELPERS_ASYMM_H #define ARM_COMPUTE_HELPERS_ASYMM_H #include "helpers.h" +/** Convert the given vector with round to nearest even rounding mode + * + * @param[in] x The target to be converted + * @param[in] type The target type + * + * @return The converted vector + */ +#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) + +/** Quantize a floating-point scalar value to 8-bit asymmetric + * + * @param[in] input Input value to quantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return quantized value + */ +inline uchar quantize_qasymm8(float input, float offset, float scale) +{ + float out_f32 = input / scale + offset; + uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); + return res_u8; +} + +/** Dequantize a scalar value from 8-bit asymmetric to floating-point + * + * @param[in] input Input value to quantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return quantized value + */ +inline float dequantize_qasymm8(uchar input, float offset, float scale) +{ + return ((float)input - offset) * scale; +} + +/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point + * + * @param[in] input Input value to quantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return quantized value + */ +inline float dequantize_qasymm8_signed(char input, float offset, float scale) +{ + return ((float)input - offset) * scale; +} + +/** Quantize a vector of values from floating-point + * + * @param[in] type Output data type. + * @param[in] size Size of vector. + * + * @return quantized values + */ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \ + VEC_DATA_TYPE(type, size)); \ + return res; \ + } + +/** Dequantize a vector of values to floating-point + * + * @param[in] type Input data type. + * @param[in] size Size of vector. + * + * @return dequantized values in floating point + */ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ + } + /** Correctly-rounded-to-nearest division by a power-of-two. * * @param[in] size Size of vector. * * @return Correctly-rounded-to-nearest division by a power-of-two. 
*/ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - VEC_DATA_TYPE(int, size) \ - mask = (1 << exponent) - 1; \ - const VEC_DATA_TYPE(int, size) zero = 0; \ - const VEC_DATA_TYPE(int, size) one = 1; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, x < 0); \ - return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \ + VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ + const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + mask = (one << exponent) - one; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, x < 0); \ + return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), @@ -81,9 +164,19 @@ b_64 = convert_long##size(b); \ VEC_DATA_TYPE(long, size) \ ab_64 = a_64 * b_64; \ - /* COMPMID-907 */ \ + /* Revert COMPMID-907 */ \ + VEC_DATA_TYPE(long, size) \ + mask1 = 1 << 30; \ + VEC_DATA_TYPE(long, size) \ + mask2 = 1 - (1 << 30); \ + VEC_DATA_TYPE(long, size) \ + is_positive_or_zero = ab_64 >= 0; \ + VEC_DATA_TYPE(long, size) \ + nudge = select(mask2, mask1, is_positive_or_zero); \ + VEC_DATA_TYPE(long, size) \ + mask = 1ll << 31; \ VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \ + ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ return select(ab_x2_high32, INT_MAX, overflow); \ } @@ -335,9 +428,18 @@ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ } +#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) +#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) +#define DEQUANTIZE_STR(input, offset, scale, type, size) \ + dequantize_##type##size(input, offset, scale) +#define DEQUANTIZE(input, offset, scale, type, size) \ + DEQUANTIZE_STR(input, offset, scale, type, size) + #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \ asymm_rounding_divide_by_POW2_##size(x, exponent) #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ + ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ @@ -360,11 +462,53 @@ #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ asymm_rescale##size(value, src_integer_bits, dst_integer_bits) +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 
0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ + right_shift, size); \ + } +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ + multiply_by_quantized_multiplier##size(input, qmul, shift) + +QUANTIZE_IMPL(uchar, 1) +QUANTIZE_IMPL(char, 1) +QUANTIZE_IMPL(uint, 1) +QUANTIZE_IMPL(int, 1) +QUANTIZE_IMPL(uchar, 4) +QUANTIZE_IMPL(ushort, 4) +QUANTIZE_IMPL(short, 4) +QUANTIZE_IMPL(uchar, 16) +QUANTIZE_IMPL(char, 16) +QUANTIZE_IMPL(ushort, 16) +QUANTIZE_IMPL(short, 16) +QUANTIZE_IMPL(uint, 16) +QUANTIZE_IMPL(int, 16) + +DEQUANTIZE_IMPL(uchar, 1) +DEQUANTIZE_IMPL(char, 1) +DEQUANTIZE_IMPL(uint, 1) +DEQUANTIZE_IMPL(int, 1) +DEQUANTIZE_IMPL(uchar, 4) +DEQUANTIZE_IMPL(ushort, 4) +DEQUANTIZE_IMPL(short, 4) +DEQUANTIZE_IMPL(uchar, 16) +DEQUANTIZE_IMPL(char, 16) +DEQUANTIZE_IMPL(ushort, 16) +DEQUANTIZE_IMPL(short, 16) +DEQUANTIZE_IMPL(uint, 16) +DEQUANTIZE_IMPL(int, 16) + +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) +ASYMM_MULT_IMPL(1) ASYMM_MULT_IMPL(2) ASYMM_MULT_IMPL(4) ASYMM_MULT_IMPL(8) @@ -375,16 +519,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) +ASYMM_SELECT_USING_MASK_IMPL(1) ASYMM_SELECT_USING_MASK_IMPL(2) ASYMM_SELECT_USING_MASK_IMPL(4) ASYMM_SELECT_USING_MASK_IMPL(8) ASYMM_SELECT_USING_MASK_IMPL(16) +ASYMM_MASK_IF_ZERO_IMPL(1) ASYMM_MASK_IF_ZERO_IMPL(2) ASYMM_MASK_IF_ZERO_IMPL(4) ASYMM_MASK_IF_ZERO_IMPL(8) ASYMM_MASK_IF_ZERO_IMPL(16) +ASYMM_MASK_IF_NON_ZERO_IMPL(1) ASYMM_MASK_IF_NON_ZERO_IMPL(2) ASYMM_MASK_IF_NON_ZERO_IMPL(4) ASYMM_MASK_IF_NON_ZERO_IMPL(8) @@ -400,6 +547,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) @@ -415,9 +563,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) +ASYMM_RESCALE_IMPL(1) ASYMM_RESCALE_IMPL(2) ASYMM_RESCALE_IMPL(4) ASYMM_RESCALE_IMPL(8) ASYMM_RESCALE_IMPL(16) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) + #endif // ARM_COMPUTE_HELPERS_ASYMM_H diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl deleted file mode 100644 index 12c8eeb..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
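
For the quantization helpers introduced in this helpers_asymm.h hunk, a rough usage sketch follows, again illustrative only: the kernel name quant_demo, its buffers, and the scale/offset values are assumptions, not part of the patch.

    #include "helpers_asymm.h"

    // Hypothetical demo kernel: scalar and vector quantize/dequantize, plus a
    // fixed-point requantization via MULTIPLY_BY_QUANTIZED_MULTIPLIER.
    __kernel void quant_demo(__global uchar *in, __global uchar *out, __global int *acc)
    {
        const float scale  = 0.5f;   // assumed quantization parameters
        const float offset = 128.0f;

        // Scalar helpers: dequantize then requantize reproduces the input value.
        float x = dequantize_qasymm8(in[0], offset, scale); // (in[0] - 128) * 0.5
        out[0]  = quantize_qasymm8(x, offset, scale);

        // Vector forms generated by QUANTIZE_IMPL / DEQUANTIZE_IMPL(uchar, 4).
        uchar4 v = vload4(0, in);
        float4 f = DEQUANTIZE(v, offset, scale, uchar, 4);
        vstore4(QUANTIZE(f, offset, scale, uchar, 4), 1, out); // writes out[4..7]

        // Requantize int32 accumulators by ~0.25: qmul = 2^30 encodes a x0.5
        // multiplier, shift = -1 adds a rounding right-shift by one bit.
        vstore4(MULTIPLY_BY_QUANTIZED_MULTIPLIER(vload4(0, acc), 1073741824, -1, 4), 0, acc);
    }

The new size-1 instantiations (QUANTIZE_IMPL(uchar, 1), ASYMM_MULT_IMPL(1), and so on) follow the same pattern, presumably so the macros can also be used when VEC_SIZE is 1.
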
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#ifndef VEC_SIZE -#define VEC_SIZE 1 -#endif - -#if defined(DATA_TYPE) -/** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * @note Can only take floating point data types. - * - * @param[in] input1_ptr Pointer to the source image. Supported Data - * types : F16/F32 - * @param[in] input1_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[in] alpha_ptr Pointer to the source image. Supported Data - * types : F16/F32 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source - * image - * - * @param[out] output_ptr Pointer to the destination image. 
Supported - * data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 - ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) - : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), - 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl deleted file mode 100644 index a66e107..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "helpers.h" -#define SUB(x, y) (x) - (y) - -#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \ - defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) - -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) -#define SELECT_TYPE VEC_INT - -/** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. - * - * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. - * -DDATA_TYPE_IN=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. - * -DVEC_SIZE=16 - * @note Can only take uchar data types. - * - * @param[in] input1_ptr Pointer to the source image. Supported Data - * types : QASYMM8 - * @param[in] input1_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[in] alpha_ptr Pointer to the source image. Supported Data - * types : QASYMM8 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported - * data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); - VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); - - in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN)); - alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA)); - - const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN); - const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA); - const VEC_FLOAT outf32 = - select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE)); - const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); - const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); - - VSTORE(VEC_SIZE) - (res, 0, (__global uchar *)output.ptr); -} - -#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && - // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl deleted file mode 100644 index eb612f8..0000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016, 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. - * e.g. -DDEPTH_IN=16 - * @attention The value of the z-axis of input tensor depth should be given as a preprocessor - * argument using -DZ_IN=size. e.g. -DZ_IN=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. - * -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - int out_index[4] = {0}; - int in_index[4] = {0}; - - in_index[0] = get_global_id(0); // W - in_index[1] = get_global_id(1); // H - in_index[2] = get_global_id(2) % Z_IN; // C - in_index[3] = get_global_id(2) / Z_IN; // B - - out_index[0] = in_index[0] / BLOCK_SIZE; - out_index[1] = in_index[1] / BLOCK_SIZE; - out_index[2] = - in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; - out_index[3] = in_index[3]; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], - out_index[3])) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) - -#if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) -/** Perform space to depth rearrangement of tensor - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. - * e.g. -DDEPTH_IN=16 - * @attention The value of the z-axis of input tensor depth should be given as a preprocessor - * argument using -DZ_IN=size. e.g. -DZ_IN=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. - * -DBLOCK_SIZE=1 - * - * @param[in] input_ptr Pointer to the source image. Supported data - * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * image - * @param[out] output_ptr Pointer to the destination image. 
Supported data - * types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension - * (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension - * (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the - * destination image - */ -__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - int out_index[4] = {0}; - int in_index[4] = {0}; - - in_index[0] = get_global_id(0); // C - in_index[1] = get_global_id(1); // W - in_index[2] = get_global_id(2) % Z_IN; // H - in_index[3] = get_global_id(2) / Z_IN; // B - - out_index[0] = - in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN; - out_index[1] = in_index[1] / BLOCK_SIZE; - out_index[2] = in_index[2] / BLOCK_SIZE; - out_index[3] = in_index[3]; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], - out_index[3])) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp deleted file mode 100644 index 06eeb5b..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) -{ - TensorShape out_shape{input_shape}; - - out_shape.set(axis, 1); - - return out_shape; -} -} // namespace - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, - ArgOperation /*op*/) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8, - DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) != - output->tensor_shape().num_dimensions(), - "Input's rank is not same with output"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, - "Inputs are not broadcast compatible"); - - const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), - "output shape's size does not match axis"); - - const auto num_dimensions = input->tensor_shape().num_dimensions(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank)."); - return Status{}; -} - -} // namespace - -CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} - -void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, - ArgOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - - _input = input; - _output = output; - _axis = axis; - - std::unique_ptr output_info = output->info()->clone(); - output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); - - // Construct kernel and set op_code based on type of ArgOperation as specified by object op - std::string kernel_name = "arg_op"; - int op_code = 0; - if (op == ArgOperation::MAX) - { - op_code = 1; - } - else if (op == ArgOperation::MIN) - { - op_code = 2; - } - else - throw std::runtime_error("Operation not supported, yet"); - - // Set kernel build options - std::set build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); - build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); - - // Create kernel - _kernel = - static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output_info, Steps()); - - Coordinates coord; - coord.set_num_dimensions(output_info->num_dimensions()); - 
output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const uint32_t axis, ArgOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); - - return Status{}; -} - -void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &shape_in = _input->info()->tensor_shape(); - - unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters - - _kernel.setArg(idx++, _axis); - _kernel.setArg(idx++, shape_in[_axis]); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup input slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - // Copy output's shape in order to use for recovering at end of this method - const TensorShape shape_out = _output->info()->tensor_shape(); - _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); - - // Recover output's shape of output tensor - _output->info()->set_tensor_shape(shape_out); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp index bb55568..fbc76f5 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp deleted file mode 100644 index 01ea655..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/CL/kernels/CLCastKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {} - -void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - - _input = input; - _output = output; - - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Set kernel build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option("-DDATA_TYPE_OUT=" + - get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - // Create kernel - if (is_data_type_quantized_asymmetric(input->info()->data_type())) - { - UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); - const float scale_in = qinfo.scale; - const int offset_in = qinfo.offset; - build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in)); - - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options())); - } - else if (is_data_type_quantized_asymmetric(output->info()->data_type())) - { - UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform(); - const float scale_in = qinfo.scale; - const float offset_in = qinfo.offset; - - build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in)); - - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options())); - } - else - { - build_opts.add_option_if(input_subtype == 
SubDataType::BOOL, "-DBOOL_INPUT"); - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel("cast", build_opts.options())); - } - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure_internal(win); -} - -void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } while (collapsed.slide_window_slice_3D(slice)); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp deleted file mode 100644 index 3891368..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -// TODO Use this validation function -#if 0 -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size, - "Output width should be equal to (Input width * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size, - "Output height should be equal to (Input height * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0, - "Input depth should be divisible by (block size * block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - output->dimension(2) != input->dimension(2) / (block_size * block_size), - "Output depth should be equal to (Input depth / (block size * block size))"); - - return Status{}; -} -#endif -} // namespace - -CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr) -{ - // DO NOTHING -} - -void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t block_size) -{ - // TODO Add validation of data_layout - _input = input; - _output = output; - - // Set kernel build options - auto layout_out = output->info()->data_layout(); - std::set<std::string> build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); - auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); - auto depth = output->info()->dimension(index_depth); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth)); - build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z())); - - // Create kernel - _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( - "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup input slice - Window slice_in(slice_out); - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_in.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; -
add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp index 79f5ce0..67aaf2d 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp deleted file mode 100644 index 235e897..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "support/ToolchainSupport.h" - -#include -#include -#include - -using namespace arm_compute; -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -class Coordinates; -} // namespace arm_compute - -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, - const ITensorInfo *output, const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, - "The number of dimensions for the matrix A must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, - "The number of dimensions for the matrix B must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && - gemm_info.reinterpret_input_as_3d(), - "The input1 tensor cannot have more than 2 dimensions if input0 " - "has to be reinterpreted as 3D"); - - const int m = gemm_info.m(); - const int n = gemm_info.n(); - const int k = gemm_info.k(); - - ARM_COMPUTE_UNUSED(m); - ARM_COMPUTE_UNUSED(n); - ARM_COMPUTE_UNUSED(k); - - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n)); - ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k)); - if (gemm_info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != - static_cast<unsigned int>(m)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m)); - } - - if (output->total_size() != 0) - { - const TensorInfo tensor_info_output = - output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, - ITensorInfo *output, - const GEMMReshapeInfo &gemm_info, - ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if (reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_input_as_3d = false; - reinterpret_output_as_3d = false; - } - - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output, - input0->clone() - ->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info)) - .set_data_type(DataType::S32)); - - TensorInfo tmp_info(*output); - - if (reinterpret_output_as_3d) - { - // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D - // GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(output->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x - // Note: if the dot product instruction is available, the 8x2 tile has to be used - num_elems_processed_per_iteration_x = 4; - num_elems_processed_per_iteration_y = std::min(static_cast(output->dimension(1)), 4); - - // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor - // The only way to set properly the paddings, it is to set those explicitly through the - // AccessWindowStatic - const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] - : input0->tensor_shape()[1]; - const int bottom_pad = - (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % - num_elems_processed_per_iteration_y; - - // Configure window - win = calculate_max_window( - tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window( - *output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), - input0->dimension(1) + bottom_pad); - AccessWindowStatic input1_access( - input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), - input1->dimension(1)); - AccessWindowStatic output_access( - output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), - output->dimension(1) + bottom_pad); - - window_changed = - update_window_and_padding(win, input0_access, - input1_access) || // window used by the execute_window_loop - update_window_and_padding( - win_out, - output_access); // window used to update the padding requirements of output tensor - - Coordinates coord; - coord.set_num_dimensions(output->num_dimensions()); - output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape())); - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = - std::min(static_cast(output->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) - ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -CLGEMMLowpMatrixMultiplyKernelEx::CLGEMMLowpMatrixMultiplyKernelEx() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), - _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false) -{ -} - -void CLGEMMLowpMatrixMultiplyKernelEx::configure(const ICLTensor *input0, const ICLTensor *input1, - ICLTensor *output, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - - ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input0->info(), input1->info(), output->info(), gemm_info)); - - _input0 = input0; - _input1 = input1; - _output = output; - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - // In case both input and output have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d - ? _input0->info()->num_dimensions() - 1 - : _input0->info()->num_dimensions(); - _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), - gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Create build options - std::string kernel_name(" "); - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, - "-DHEIGHT_GEMM3D=" + - support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, - "-DDEPTH_GEMM3D=" + - support::cpp11::to_string(output->info()->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, - "-DMATRIX_B_DEPTH=" + - support::cpp11::to_string(input1->info()->dimension(2))); - build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); - build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + - support::cpp11::to_string(num_elements_processed.x())); - build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + - support::cpp11::to_string(num_elements_processed.y())); - - kernel_name = "gemmlowp_mm_midgard_ex"; - - // Create kernel - _kernel = static_cast( - CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); - _config_id += lower_string(string_from_data_type(input0->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); -} - -Status CLGEMMLowpMatrixMultiplyKernelEx::validate(const ITensorInfo *input0, - const ITensorInfo *input1, - const ITensorInfo *output, - const GEMMReshapeInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input0->clone().get(), input1->clone().get(), - output->clone().get(), gemm_info, num_elements_processed) - .first); - - return Status{}; -} - -void CLGEMMLowpMatrixMultiplyKernelEx::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - if (_input1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if (_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = - _input0->info()->padding().top + _input0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if (_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = - 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); - const unsigned int total_cross_plane_pad = - _output->info()->padding().top + _output->info()->padding().bottom; - _kernel.setArg(idx0, static_cast<unsigned int>(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A - // more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution - // operation - if (!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg(idx++, - static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, - static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, - static_cast<unsigned int>(_output->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint()); - } while (window.slide_window_slice_3D(slice)); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp index 3a25987..3bfe3e4 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -45,6 +45,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/core/UtilsEx.h" +#include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp index 7fbdcda..930e7c9 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" using namespace arm_compute; @@ -110,7 +111,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso _hits = hits; // Make _lookup_indices tensor - _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>(); + _lookup_indices = support::cpp14::make_unique<CLTensor>(); _lookup_indices->allocator()->init( TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); _lookup_indices->allocator()->allocate(); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp index b45f6bb..61c14d2 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -48,7 +48,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" - +#include "support/StringSupport.h" #include "support/ToolchainSupport.h" namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp index d305896..6b27c99 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp @@ -49,6 +49,7 @@ #include "arm_compute/core/Utils.h" #include
"arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp index 74f7b41..643c8b1 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp deleted file mode 100644 index 8910a7b..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; - -Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) -{ - const TensorShape &out_shape = - TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, - DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, - DataType::QASYMM8); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - // Validate in case of configured output - if (output->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, - DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); - } - return Status{}; -} -} // namespace - -CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} - -void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info())); - - _input = input; - _alpha = alpha; - _output = output; - - // Create kernel - std::string kernel_name = "prelu"; - std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); - - if (is_data_type_quantized_asymmetric(input->info()->data_type())) - { - build_opts.emplace("-DOFF_IN=" + support::cpp11::to_string( - input->info()->quantization_info().uniform().offset)); - build_opts.emplace("-DOFF_ALPHA=" + support::cpp11::to_string( - alpha->info()->quantization_info().uniform().offset)); - build_opts.emplace("-DOFF_OUT=" + support::cpp11::to_string( - output->info()->quantization_info().uniform().offset)); - build_opts.emplace("-DSCALE_IN=" + support::cpp11::to_string( - input->info()->quantization_info().uniform().scale)); - build_opts.emplace("-DSCALE_ALPHA=" + support::cpp11::to_string( - alpha->info()->quantization_info().uniform().scale)); - build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string( - output->info()->quantization_info().uniform().scale)); - kernel_name += "_qasymm8"; - } - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); - - const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); - - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - { - set_shape_if_empty(*output->info(), out_shape); - - if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) - { - set_format_if_unknown(*output->info(), Format::F16); - } - else if (input->info()->data_type() == DataType::F32 || - alpha->info()->data_type() == DataType::F32) - { -
set_format_if_unknown(*output->info(), Format::F32); - } - } - - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); - Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); - - AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const TensorShape &in_shape1 = _input->info()->tensor_shape(); - const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); - const TensorShape &out_shape = _output->info()->tensor_shape(); - - bool can_collapse = true; - if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; - - const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = - has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_input1); - add_3D_tensor_argument(idx, _alpha, slice_input2); - add_3D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice); - - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - } while (collapsed.slide_window_slice_3D(slice)); -} - -BorderSize CLPReLUKernel::border_size() const -{ - const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); - const unsigned int border = - std::min(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp index 2d551f6..1a7a18c 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp @@ -49,6 +49,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "support/StringSupport.h" namespace arm_compute { @@ -69,7 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_fac // Output must always be initialized ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); return Status{}; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp index a983183..06c2579 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" using namespace arm_compute; namespace diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp index ff1904a..8d8853c 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp @@ -48,6 +48,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "support/StringSupport.h" #include diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp deleted file mode 100644 index 64fc038..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const int32_t block_size) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, - DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, - "Block size should be greater than or equal to 1."); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3), - "Input batch should be equal to Output batch"); - - auto layout_out = input->data_layout(); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - - auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); - auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT); - auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth), - "Output depth should be equal to (input depth * block size *block size)"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) || - (input->dimension(index_height) % block_size), - "Input height and width should be divisible by block size"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - 
(output->dimension(index_width) != (input->dimension(index_width) / block_size)) || - (output->dimension(index_height) != (input->dimension(index_height) / block_size)), - "Output height and width should be equal to " - "input_height/blocksize and input_width/blocksize respectively"); - - return Status{}; -} - -} // namespace - -CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} - -void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, - const int32_t block_size) -{ - - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); - - _input = input; - _output = output; - - // Set kernel build options - auto layout_out = input->info()->data_layout(); - std::set build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); - auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); - auto depth = input->info()->dimension(index_depth); - build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth)); - build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z())); - - // Create kernel - _kernel = static_cast(CLKernelLibraryEx::get().create_kernel( - "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts)); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup output slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, slice_in); - add_4D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_in); - } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp deleted file mode 100644 index 61999cb..0000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -using namespace arm_compute; - -CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel() - : _input(nullptr), _output(nullptr), _inner_border(), _info() -{ -} - -Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input, - const ITensorInfo *output, - const BorderSize &inner_border, - const PadStrideInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - - const DataLayout data_layout = input->data_layout(); - - const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0); - ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); - - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); - for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, - "inner_border_right must be smaller that stride_x"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, - "inner_border_top must be smaller that stride_y"); - - return Status{}; -} - -void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, - const 
BorderSize &inner_border, - const PadStrideInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - _input = input; - _output = output; - _inner_border = inner_border; - _info = info; - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate( - input->info(), output->info(), inner_border, info)); - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - _kernel = static_cast( - CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options())); - - constexpr unsigned int num_elems_processed_per_iteration = 1; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const DataLayout data_layout = _input->info()->data_layout(); - - const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - const int out_start_x = _info.pad_left(); - const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right - - _info.pad_right() + _info.stride().first - 1; - const int out_step_x = _info.stride().first; - - const int out_start_y = _inner_border.top + _info.pad_top(); - const int out_end_y = - _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; - const int out_step_y = _info.stride().second; - - switch (data_layout) - { - case DataLayout::NCHW: - { - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - - Window slice_out = collapsed.first_slice_window_3D(); - slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x)); - slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y)); - - Window slice_in = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_3D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (collapsed.slide_window_slice_3D(slice_in) && - collapsed.slide_window_slice_3D(slice_out)); - break; - } - case DataLayout::NHWC: - { - // NOTE: not collapsing in NHWC - Window slice_out = window.first_slice_window_3D(); - slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x)); - slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y)); - - Window slice_in = window.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_3D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out); - } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data layout"); - } -} diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp deleted file mode 100644 
index 648afb3..0000000 --- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include -#include - -namespace arm_compute -{ -CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {} - -bool CPPUpsampleKernelEx::is_parallelisable() const { return false; } - -void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output, - const PadStrideInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - _input = input; - _output = output; - _info = info; - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); - - // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); - - ICPPKernel::configure(win); -} - -void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - - // Initialize _scaled_output buffer - const int width_scaled = _output->info()->dimension(0); - const int height_scaled = _output->info()->dimension(1); - const int stride_x = _info.stride().first; - const int stride_y = _info.stride().second; - const int start_x = _info.pad_left(); - const int start_y = _info.pad_top(); - const int end_y = height_scaled - _info.pad_bottom(); - const int end_x = width_scaled - _info.pad_top(); - const size_t element_size = _input->info()->element_size(); - - // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset - const uint8_t fill_value = - _output->info()->data_type() == DataType::QASYMM8 - ? utility::clamp(_output->info()->quantization_info().uniform().offset) - : 0; - // Filling a value different than 0 works only for QASYMM8 datatype since we are filling 1byte - // values in a buffer of uint8_ts - std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value); - - // Create window - Window window_out(window); - window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x)); - window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y)); - - // Create iterators - Iterator in(_input, window); - Iterator out(_output, window_out); - - execute_window_loop( - window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp deleted file mode 100644 index fbb9dbc..0000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp +++ /dev/null @@ -1,671 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/NEON/kernels/NECastKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - SubDataType input_subtype) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, - DataType::QASYMM8, DataType::U32, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL && - input->data_type() != DataType::U8); - - if (output->tensor_shape().total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, - DataType::QASYMM8, DataType::U32, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} - -std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - // Configure kernel window - Window win = calculate_max_window(*input, Steps()); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); - - // NECastKernel doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(output->num_dimensions()); - output->set_valid_region(ValidRegion(coord, output->tensor_shape())); - - return std::make_tuple(Status{}, win); -} - -typedef struct bool8x16 -{ - uint8x16_t val; -} 
bool8x16_t; - -static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; } - -template inline ToV vcast(const FromV &v) { return v; } -template <> inline uint8x16_t vcast(const bool8x16_t &v) -{ - const uint8x16_t vu8 = vreinterpretq_u8_b8(v); - const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); - uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); - return vshrq_n_u8(mask, 7); // true -> 1, false -> 0 -} - -template <> inline uint32x4x4_t vcast(const bool8x16_t &v) -{ - const uint8x16_t vu8 = vreinterpretq_u8_b8(v); - const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); - uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); - uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 - - const uint32x4x4_t ret = {{ - vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))), - vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))), - vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))), - vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))), - }}; - - return ret; -} - -template <> inline int32x4x4_t vcast(const bool8x16_t &v) -{ - const uint8x16_t vu8 = vreinterpretq_u8_b8(v); - const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); - uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); - uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 - - const int32x4x4_t ret = {{ - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), - }}; - - return ret; -} - -template <> inline float32x4x4_t vcast(const bool8x16_t &v) -{ - const uint8x16_t vu8 = vreinterpretq_u8_b8(v); - const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); - uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); - uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 - - const float32x4x4_t ret = {{ - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), - }}; - - return ret; -} - -template <> inline uint32x4x4_t vcast(const uint8x16_t &v) -{ - const uint32x4x4_t ret = {{ - vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))), - vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))), - vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))), - vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))), - }}; - - return ret; -} - -template <> inline int32x4x4_t vcast(const uint8x16_t &v) -{ - const int32x4x4_t ret = {{ - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), - }}; - - return ret; -} - -template <> inline float32x4x4_t vcast(const uint8x16_t &v) -{ - const float32x4x4_t ret = {{ - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), - }}; - - return ret; -} - -template <> inline uint8x16_t vcast(const int32x4x4_t &v) -{ - // Saturate cast - return 
vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))), - vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3])))); -} - -template <> inline uint32x4x4_t vcast(const int32x4x4_t &v) -{ - // Saturate cast - const uint32x4x4_t ret = {{ - vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))), - vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))), - vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))), - vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))), - vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))), - vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))), - vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))), - vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))), - }}; - - return ret; -} - -template <> inline float32x4x4_t vcast(const int32x4x4_t &v) -{ - const float32x4x4_t ret = {{ - vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]), - vcvtq_f32_s32(v.val[3]), - }}; - - return ret; -} - -template <> inline uint8x16_t vcast(const uint32x4x4_t &v) -{ - return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))), - vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3])))); -} - -template <> inline int32x4x4_t vcast(const uint32x4x4_t &v) -{ - const int32x4x4_t ret = {{ - vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))), - vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))), - vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))), - vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))), - vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))), - vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))), - vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))), - vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))), - }}; - - return ret; -} - -template <> inline float32x4x4_t vcast(const uint32x4x4_t &v) -{ - const float32x4x4_t ret = {{ - vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]), - vcvtq_f32_u32(v.val[3]), - }}; - - return ret; -} - -template <> inline uint8x16_t vcast(const float32x4x4_t &v) -{ - // Saturate cast - return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])), - vqmovun_s32(vcvtq_s32_f32(v.val[1])))), - vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])), - vqmovun_s32(vcvtq_s32_f32(v.val[3]))))); -} - -template <> inline uint32x4x4_t vcast(const float32x4x4_t &v) -{ - const uint32x4x4_t ret = {{ - vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]), - vcvtq_u32_f32(v.val[3]), - }}; - - return ret; -} - -template <> inline int32x4x4_t vcast(const float32x4x4_t &v) -{ - const int32x4x4_t ret = {{ - vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]), - vcvtq_s32_f32(v.val[3]), - }}; - - return ret; -} - -template struct cast_vector; -template <> struct cast_vector -{ - using type = bool8x16_t; -}; -template <> struct cast_vector -{ - using type = uint8x16_t; -}; -template <> struct cast_vector -{ - using type = uint32x4x4_t; -}; -template <> struct cast_vector -{ - using type = int32x4x4_t; -}; -template <> struct cast_vector -{ - using type = float32x4x4_t; -}; - -template inline void store_result(T *ptr, const typename cast_vector::type &v) -{ - wrapper::vstore(ptr, v.val[0]); - wrapper::vstore(ptr + 4, v.val[1]); - wrapper::vstore(ptr + 8, v.val[2]); - wrapper::vstore(ptr 
+ 12, v.val[3]); -} - -template <> inline void store_result(uint8_t *ptr, const uint8x16_t &v) -{ - wrapper::vstore(ptr, v); -} - -inline bool8x16_t vloadq(const bool *ptr) -{ - bool8x16_t ret; - ret.val = wrapper::vloadq(reinterpret_cast(ptr)); - return ret; -} - -template inline typename cast_vector::type load_input(const T *ptr) -{ - return wrapper::vloadq(ptr); -} - -template <> inline typename cast_vector::type load_input(const bool *ptr) -{ - return vloadq(ptr); -} - -template <> inline typename cast_vector::type load_input(const uint32_t *ptr) -{ - return vld4q_u32(ptr); -} - -template <> inline typename cast_vector::type load_input(const int32_t *ptr) -{ - return vld4q_s32(ptr); -} - -template <> inline typename cast_vector::type load_input(const float *ptr) -{ - return vld4q_f32(ptr); -} - -template inline T get_value(const T *ptr) { return *ptr; } - -template <> inline bool get_value(const bool *ptr) -{ - bool ret = (*ptr != 0); - return ret; -} - -template void run_cast(const ITensor *input, ITensor *output, const Window &window) -{ - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - - execute_window_loop( - win_collapsed, - [&](const Coordinates &) { - const auto in_ptr = reinterpret_cast(in.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - using from_vector = typename cast_vector::type; - const from_vector vin = load_input(in_ptr + x); - - switch (output->info()->data_type()) - { - case DataType::U8: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::QASYMM8: - { - using to_vector = typename cast_vector::type; - const UniformQuantizationInfo &qinfo_out = - output->info()->quantization_info().uniform(); - const auto vf = vcast(vin); - const auto vout = vquantize(vf, qinfo_out); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::U32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::S32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::F32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - FromT val = get_value(in_ptr + x); - switch (output->info()->data_type()) - { - case DataType::U8: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - case DataType::QASYMM8: - { - const QuantizationInfo &qinfo_out = 
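// run_cast() above follows the usual NEON tiling pattern: the window's X dimension is
// collapsed so the kernel steps through it manually, a vector loop converts 16 input
// elements per iteration, and a scalar loop handles the leftover tail. A simplified,
// self-contained sketch of that control flow for a plain uint8 -> float conversion;
// the function is illustrative only and not part of the kernel:
#include <arm_neon.h>
#include <cstdint>

static void cast_u8_to_f32(const uint8_t *in, float *out, int n)
{
  int x = 0;
  for (; x <= n - 16; x += 16) // vector main loop, 16 elements per step
  {
    const uint8x16_t v = vld1q_u8(in + x);
    const uint16x8_t lo = vmovl_u8(vget_low_u8(v));
    const uint16x8_t hi = vmovl_u8(vget_high_u8(v));
    vst1q_f32(out + x + 0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))));
    vst1q_f32(out + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo))));
    vst1q_f32(out + x + 8, vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))));
    vst1q_f32(out + x + 12, vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi))));
  }
  for (; x < n; ++x) // scalar tail loop for the remaining elements
  {
    out[x] = static_cast<float>(in[x]);
  }
}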
output->info()->quantization_info(); - const auto qval = - quantize_qasymm8(static_cast(val), qinfo_out, rounding_policy); - *(reinterpret_cast(out.ptr()) + x) = qval; - break; - } - case DataType::U32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - case DataType::S32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - case DataType::F32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - } - }, - in, out); -} - -void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window) -{ - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - const auto &qinfo_in = input->info()->quantization_info().uniform(); - const auto &qinfo_out = output->info()->quantization_info().uniform(); - - execute_window_loop( - win_collapsed, - [&](const Coordinates &) { - const auto in_ptr = reinterpret_cast(in.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - using from_vector = typename cast_vector::type; - const auto vf = wrapper::vloadq(in_ptr + x); - const auto vin = vdequantize(vf, qinfo_in); - switch (output->info()->data_type()) - { - case DataType::U8: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::QASYMM8: - { - using to_vector = typename cast_vector::type; - const auto vf = vcast(vin); - const auto vout = vquantize(vf, qinfo_out); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::U32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::S32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - case DataType::F32: - { - using to_vector = typename cast_vector::type; - const to_vector vout = vcast(vin); - store_result(reinterpret_cast(out.ptr()) + x, vout); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - qasymm8_t qval_in = *(in_ptr + x); - const auto val = dequantize_qasymm8(qval_in, qinfo_in); - - switch (output->info()->data_type()) - { - case DataType::U8: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - case DataType::QASYMM8: - { - const auto qval_out = quantize_qasymm8(val, qinfo_out, rounding_policy); - *(reinterpret_cast(out.ptr()) + x) = qval_out; - break; - } - case DataType::U32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - case DataType::S32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - 
case DataType::F32: - { - *(reinterpret_cast(out.ptr()) + x) = static_cast(val); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - } - }, - in, out); -} -} // namespace - -NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE) -{ -} - -void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype)); - - _input = input; - _output = output; - _input_subtype = input_subtype; - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - INEKernel::configure(std::get<1>(win_config)); -} - -Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - SubDataType input_subtype) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype)); - ARM_COMPUTE_RETURN_ON_ERROR( - std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); - return Status{}; -} - -void NECastKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - switch (_input->info()->data_type()) - { - case DataType::U8: - if (_input_subtype == SubDataType::BOOL) - { - run_cast(_input, _output, window); - } - else - { - run_cast(_input, _output, window); - } - break; - case DataType::QASYMM8: - run_cast_qasymm8(_input, _output, window); - break; - case DataType::U32: - run_cast(_input, _output, window); - break; - case DataType::S32: - run_cast(_input, _output, window); - break; - case DataType::F32: - run_cast(_input, _output, window); - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp deleted file mode 100644 index 95e269d..0000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
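// For QASYMM8 input, run_cast_qasymm8() above dequantizes with the input tensor's
// (scale, offset), casts, and requantizes with the output tensor's (scale, offset) when
// the destination is also QASYMM8; the kernel rounds TO_NEAREST_EVEN on aarch64 and
// TO_ZERO elsewhere. A scalar sketch of that requantization step under the usual
// asymmetric-quantization formula -- illustrative only (std::lround stands in for the
// kernel's rounding policy, and the helper is not the library's quantize_qasymm8):
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t requantize_u8(uint8_t q_in, float scale_in, int32_t offset_in, float scale_out,
                             int32_t offset_out)
{
  const float real = scale_in * (static_cast<int32_t>(q_in) - offset_in); // dequantize
  const int32_t q = static_cast<int32_t>(std::lround(real / scale_out)) + offset_out;
  return static_cast<uint8_t>(std::min(255, std::max(0, q))); // requantize and clamp to u8
}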
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" -#include -#include - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2); - - const DataLayout data_layout = input->data_layout(); - const int idx_channel = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != - 0); - // Validate output if initialized - if (output->total_size() != 0) - { - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != - (block_shape * input->tensor_shape()[idx_width])); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != - (block_shape * input->tensor_shape()[idx_height])); - ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} -} // namespace - -NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx() - : _input(nullptr), _output(nullptr), _block_shape() -{ -} - -void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output, - int32_t block_shape) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); - - _input = input; - _output = output; - _block_shape = block_shape; - - // Configure 
kernel window - Window win = calculate_max_window(*input->info(), Steps()); - ICPPKernel::configure(win); -} - -Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, - int32_t block_shape) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); - return Status{}; -} - -void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - - const int idx_channel = - get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL); - const int depth_size = _input->info()->dimension(idx_channel); - const int r = (depth_size / (_block_shape * _block_shape)); - const int element_size = _input->info()->element_size(); - - Window slice_out = window.first_slice_window_3D(); - - // The slice_out slice does not move - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Main loop for NCHW and NHWC - if (_input->info()->data_layout() == DataLayout::NCHW) - { - Window slice_in = window.first_slice_window_2D(); - do - { - Iterator in(_input, slice_in); - execute_window_loop(slice_in, - [&](const Coordinates &id) { - const int x = id.x(); - const int y = id.y(); - - const int z = id.z() % r; - const int out_x = x * _block_shape + (id.z() / r) % _block_shape; - const int out_y = y * _block_shape + (id.z() / r) / _block_shape; - Coordinates output_coords{out_x, out_y, z, id[3]}; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } while (window.slide_window_slice_2D(slice_in)); - } - else - { - Window slice_in = window.first_slice_window_3D(); - do - { - Iterator in(_input, slice_in); - execute_window_loop(slice_in, - [&](const Coordinates &id) { - const int x = id.y(); - const int y = id.z(); - - const int z = id.x() % r; - const int out_x = x * _block_shape + (id.x() / r) % _block_shape; - const int out_y = y * _block_shape + (id.x() / r) / _block_shape; - Coordinates output_coords{z, out_x, out_y, id[3]}; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } while (window.slide_window_slice_3D(slice_in)); - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp deleted file mode 100644 index 200fc4f..0000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. 
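// The deleted NEDepthToSpaceLayerKernelEx only rearranges data: each input element is
// copied to an output coordinate derived from its channel index. With
// r = C_in / (block * block), input channel z splits into output channel z % r and a
// spatial offset z / r inside the block. A scalar NCHW model of that mapping over a flat
// buffer; the helper is illustrative and assumes C % (block * block) == 0:
#include <cstddef>
#include <vector>

static std::vector<float> depth_to_space_nchw(const std::vector<float> &in, int C, int H, int W,
                                              int block)
{
  const int r = C / (block * block); // output channel count
  std::vector<float> out(static_cast<size_t>(r) * H * block * W * block);
  for (int z = 0; z < C; ++z)
    for (int y = 0; y < H; ++y)
      for (int x = 0; x < W; ++x)
      {
        const int c_out = z % r;
        const int out_x = x * block + (z / r) % block;
        const int out_y = y * block + (z / r) / block;
        out[(static_cast<size_t>(c_out) * H * block + out_y) * W * block + out_x] =
            in[(static_cast<size_t>(z) * H + y) * W + x];
      }
  return out;
}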
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" - -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" - -#include -#include -#include -#include -#include - -namespace arm_compute -{ -class Coordinates; - -namespace -{ -template -inline ScalarType elementwise_op_scalar(const ScalarType &a) -{ - switch (op) - { - case ElementWiseUnaryEx::NEG: - return -a; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -inline VectorType elementwise_op(const VectorType &a) -{ - switch (op) - { - case ElementWiseUnaryEx::NEG: - return wrapper::vneg(a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -void elementwise_op(const ITensor *in, ITensor *out, const Window &window) -{ - const int window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, - [&](const Coordinates &) { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - - int x = window_start_x; - for (; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(output_ptr + x, - elementwise_op(wrapper::vloadq(input_ptr + x))); - } - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = elementwise_op_scalar(*(input_ptr + x)); - } - }, - input, output); -} - -template -std::function -configure_func(const ITensor *input, ITensor *output) -{ - std::string function_to_call("op_"); - function_to_call += string_from_data_type(input->info()->data_type()) + "_"; - function_to_call += string_from_data_type(output->info()->data_type()); - - static std::map - map_function = { - {"op_F32_F32", &elementwise_op}, {"op_S32_S32", &elementwise_op}, - }; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - map_function["op_F16_F16"] = &elementwise_op; -#endif /* 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - - auto it = map_function.find(function_to_call); - - if (it != map_function.end()) - { - auto func = it->second; - return [func](const ITensor *input, ITensor *output, const Window &window) { - func(input, output, window); - }; - } - return nullptr; -} -} // namespace - -NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx() - : _function(nullptr), _input(nullptr), _output(nullptr) -{ -} - -void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input, - ITensor *output) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info())); - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - // Configure kernel window - const std::pair broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input->info()); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); - - Window win = calculate_max_window(valid_region); - - _input = input; - _output = output; - - INEKernel::configure(win); - - switch (op) - { - case ElementWiseUnaryEx::NEG: - _function = configure_func(input, output); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input, - const ITensorInfo &output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32, - DataType::S32); - - // Validate in case of configured output - if (output.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); - } - - return Status{}; -} - -Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input, - const ITensorInfo *output) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output)); - return Status{}; -} - -void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_function == nullptr); - _function(_input, _output, window); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp deleted file mode 100644 index 641641b..0000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. 
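// configure_func() above dispatches on a key assembled from the input/output data-type
// names ("op_F32_F32", "op_S32_S32", ...) and returns a std::function wrapping the
// matching template instantiation, or nullptr when the combination is unsupported. The
// same lookup pattern in isolation, with a toy negate function standing in for the
// kernel's elementwise_op instantiations:
#include <functional>
#include <map>
#include <string>

static void negate_f32(const float *in, float *out, int n)
{
  for (int i = 0; i < n; ++i)
    out[i] = -in[i];
}

static std::function<void(const float *, float *, int)> pick_op(const std::string &in_type,
                                                                const std::string &out_type)
{
  static const std::map<std::string, void (*)(const float *, float *, int)> table = {
      {"op_F32_F32", &negate_f32},
  };
  const auto it = table.find("op_" + in_type + "_" + out_type);
  if (it == table.end())
    return nullptr; // unsupported type combination
  return it->second;
}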
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" - -#include - -using namespace arm_compute; -namespace -{ - -/** Conditional element-wise operations */ -enum class ConditionalOperation -{ - PRELU, /**< (x * y) for x < 0, x for x >= 0 */ -}; - -template -inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b) -{ - auto res = ScalarType(0); - - switch (op) - { - case ConditionalOperation::PRELU: - res = a < 0 ? a * b : a; - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res; -} - -template -inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b, - QuantizationInfo qinfo) -{ - return quantize_qasymm8(elementwise_conditional_op_scalar(a, b), qinfo, - RoundingPolicy::TO_NEAREST_UP); -} - -template -inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b) -{ - VectorType res = {0, 0, 0, 0}; - VectorType const_0 = {0, 0, 0, 0}; - - switch (op) - { - case ConditionalOperation::PRELU: - res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b)); - ; - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res; -} - -template -inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b) -{ - float32x4x4_t out = {{ - elementwise_conditional_op(a.val[0], b.val[0]), - elementwise_conditional_op(a.val[1], b.val[1]), - elementwise_conditional_op(a.val[2], b.val[2]), - elementwise_conditional_op(a.val[3], b.val[3]), - }}; - return out; -} - -template -inline VectorType elementwise_conditional_op_broadcast(const VectorType &a, - const ScalarType &broadcast_value, - const bool reorder) -{ - VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); - return elementwise_conditional_op(reorder ? broadcast_vector : a, - reorder ? 
a : broadcast_vector); -} - -template -inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *input1_ptr, - const ScalarType *input2_ptr, ScalarType *output_ptr) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, elementwise_conditional_op(a, b)); - } - return x; -} - -template -inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x, - int window_step_x, const uint8_t *input1_ptr, - const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, - float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get inputs and compute output - const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); - const float32x4x4_t rf = elementwise_conditional_op(af, bf); - store_quantized(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x, - int window_step_x, - const ScalarType *non_broadcast_input_ptr, - const ScalarType &broadcast_value, - ScalarType *output_ptr, const bool reorder) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); - wrapper::vstore(output_ptr + x, - elementwise_conditional_op_broadcast(a, broadcast_value, reorder)); - } - return x; -} - -template -inline int elementwise_conditional_op_quantized_broadcast_loop( - int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, - float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, - float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = - load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_conditional_op(reorder ? broadcast_vector : af, - reorder ? 
af : broadcast_vector); - store_quantized(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out, - const Window &window) -{ - elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar, - &elementwise_conditional_op_broadcast_loop, - &elementwise_conditional_op_loop); -} - -template -void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, - const Window &window) -{ - elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar, - &elementwise_conditional_op_quantized_broadcast_loop, - &elementwise_conditional_op_quantized_loop); -} -} // namespace - -NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} - -void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info())); - - // Configure kernel window - const std::pair broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); - const TensorShape &out_shape = broadcast_pair.first; - const ValidRegion &valid_region = broadcast_pair.second; - - // Auto initialize output if not initialized - auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); - - Window win = calculate_max_window(valid_region); - - _input = input; - _alpha = alpha; - _output = output; - INEKernel::configure(win); -} - -void NEPReLUKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - if (_input->info()->data_type() == DataType::F32) - { - elementwise_conditional_op(_input, _alpha, - _output, window); - } - else if (_input->info()->data_type() == DataType::QASYMM8) - { - elementwise_conditional_op_quantized(_input, _alpha, _output, - window); - } - else - { - ARM_COMPUTE_ERROR("Wrong Type"); - } -} - -Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, - const ITensorInfo &output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output); - - const TensorShape out_shape = - TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - - // Checks performed when output is configured - if (output.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), - "Wrong shape for output"); - } - - return Status{}; -} - -Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha, - const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output)); - - return Status{}; -} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp index 6ba0f1f..5841f1d 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +++ 
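// The deleted NEPReLUKernel computes PReLU (x for x >= 0, alpha * x for x < 0) as a
// branch-free select: vcgt builds a per-lane mask where x > 0 and vbsl keeps x in those
// lanes while taking x * alpha in the rest. A minimal float32x4_t sketch of that select;
// the helper name is illustrative, not the kernel's:
#include <arm_neon.h>

static inline float32x4_t prelu_f32(float32x4_t x, float32x4_t alpha)
{
  const float32x4_t zero = vdupq_n_f32(0.0f);
  const uint32x4_t positive = vcgtq_f32(x, zero);     // lane mask: x > 0
  return vbslq_f32(positive, x, vmulq_f32(x, alpha)); // x where positive, alpha * x otherwise
}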
b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, DataType::F32); diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp deleted file mode 100644 index 44feb20..0000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" -#include -#include - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - - ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); - - // Validate output if initialized - if (output->total_size() != 0) - { - const DataLayout data_layout = input->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] != - output->tensor_shape()[idx_batch]); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) != - 0); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != - output->tensor_shape().total_size()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} -} // namespace - -NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx() - : _input(nullptr), _output(nullptr), _block_shape() -{ -} - -void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output, - int32_t block_shape) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); - - _input = input; - _block_shape = block_shape; - _output = output; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps()); - INEKernel::configure(win); -} - -Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, - int32_t block_shape) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); - return Status{}; -} - -void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - - const DataLayout data_layout = _input->info()->data_layout(); - const int channel_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int element_size = _input->info()->element_size(); - - const size_t channel_size = _input->info()->dimension(channel_idx); - - Window slice_out = window.first_slice_window_3D(); - - int batch_id = 0; - - // Main loop for NCHW and NHWC - if (_output->info()->data_layout() == 
DataLayout::NCHW) - { - do - { - Iterator out(_output, slice_out); - execute_window_loop(slice_out, - [&](const Coordinates &id) { - const size_t channel_id = id.z(); - const size_t in_x = - id.x() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = - id.y() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{in_x, in_y, z, batch_id}; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); - ++batch_id; - } while (window.slide_window_slice_3D(slice_out)); - } - else - { - do - { - Iterator out(_output, slice_out); - execute_window_loop(slice_out, - [&](const Coordinates &id) { - const size_t channel_id = id.x(); - const size_t in_x = - id.y() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = - id.z() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{z, in_x, in_y, batch_id}; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); - ++batch_id; - } while (window.slide_window_slice_3D(slice_out)); - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp deleted file mode 100644 index 2d379cf..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
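// The deleted NESpaceToDepthLayerKernelEx is the inverse gather: it iterates over output
// coordinates and reads from an input coordinate derived from the output channel, where
// c_out / C_in selects the position inside the block and c_out % C_in the original
// channel. A scalar NCHW model, assuming H and W are multiples of block; the helper is
// illustrative only:
#include <cstddef>
#include <vector>

static std::vector<float> space_to_depth_nchw(const std::vector<float> &in, int C, int H, int W,
                                              int block)
{
  const int Ho = H / block, Wo = W / block, Co = C * block * block;
  std::vector<float> out(static_cast<size_t>(Co) * Ho * Wo);
  for (int c_out = 0; c_out < Co; ++c_out)
    for (int y = 0; y < Ho; ++y)
      for (int x = 0; x < Wo; ++x)
      {
        const int in_x = x * block + (c_out / C) % block;
        const int in_y = y * block + (c_out / C) / block;
        const int in_c = c_out % C;
        out[(static_cast<size_t>(c_out) * Ho + y) * Wo + x] =
            in[(static_cast<size_t>(in_c) * H + in_y) * W + in_x];
      }
  return out;
}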
- */ - -#include "arm_compute/runtime/CL/functions/CLArgOperation.h" - -#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -namespace arm_compute -{ - -CLArgOperation::CLArgOperation() -{ - // DO NOTHING -} - -void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector axis, - ArgOperation op) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op)); - _input = input; - _output = output; - _axis = axis; - _arg_op = op; - // NOTE The argminmax_axis must have no duplication. - _num_of_kernels = axis.size(); - const size_t num_of_interm_tensors = _num_of_kernels - 1; - - _interm_tensors = arm_compute::support::cpp14::make_unique(num_of_interm_tensors); - _argop_kernels = - arm_compute::support::cpp14::make_unique(_num_of_kernels); - - TensorShape shape{input->info()->tensor_shape()}; - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - shape.set(_axis[i], 1); - _interm_tensors[i].allocator()->init( - TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()) - .set_data_layout(input->info()->data_layout())); - _interm_tensors[i].allocator()->allocate(); - } - - // Set a vector that is ordered ICLTensors sequentially. - std::vector tensors; - tensors.emplace_back(input); - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - tensors.emplace_back(_interm_tensors.get() + i); - } - tensors.emplace_back(output); - - // Apply ArgMinMax on all kernels - for (size_t i = 0; i < _num_of_kernels; i++) - { - _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op); - } -} - -Status CLArgOperation::validate(const ITensorInfo *input, const std::vector &axis, - const ITensorInfo *output, ArgOperation op) -{ - const size_t num_of_kernels = axis.size(); - const size_t num_of_interm_tensors = num_of_kernels - 1; - - // Create temporary tensor infos - auto interm_tensors = - arm_compute::support::cpp14::make_unique(num_of_interm_tensors); - - // Create intermediate tensor info - TensorShape shape{input->tensor_shape()}; - - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - shape.set(axis[i], 1); - interm_tensors[i].set_data_type(input->data_type()); - interm_tensors[i].set_tensor_shape(shape); - interm_tensors[i].set_num_channels(input->num_channels()); - } - - // Set a vector that is ordered ITensorInfo sequentially. 
- std::vector tensors; - tensors.emplace_back(input); - for (size_t i = 0; i < num_of_interm_tensors; i++) - { - tensors.emplace_back(interm_tensors.get() + i); - } - tensors.emplace_back(output); - - // Validate argminmax only on all kernels - for (size_t i = 0; i < num_of_kernels; i++) - { - ARM_COMPUTE_RETURN_ON_ERROR( - CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op)); - } - - return Status{}; -} - -void CLArgOperation::run() -{ - for (size_t i = 0; i < _num_of_kernels; ++i) - { - CLScheduler::get().enqueue(_argop_kernels[i]); - } -} - -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp index 92ee69a..e5122ab 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp @@ -48,7 +48,7 @@ using namespace arm_compute; void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, BinaryLogicalOperation op) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input1, input2, output, op); _kernel = std::move(k); diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp deleted file mode 100644 index b3118f3..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
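// The deleted CLArgOperation reduces over several axes by chaining one
// CLArgOperationKernel per axis: kernel i reads tensors[i] and writes tensors[i + 1],
// and every intermediate tensor is the input shape with the axes handled so far
// collapsed to 1. A small helper that reproduces that shape chain, using std::vector as
// a stand-in for TensorShape (illustrative only):
#include <vector>

static std::vector<std::vector<int>> chained_reduction_shapes(std::vector<int> shape,
                                                              const std::vector<int> &axes)
{
  std::vector<std::vector<int>> shapes; // shape after each kernel; the last is the output
  for (int axis : axes)
  {
    shape[axis] = 1;
    shapes.push_back(shape);
  }
  return shapes;
}
// For example, shape {2, 3, 4} with axes {1, 2} yields {2, 1, 4} and then {2, 1, 1}.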
- */ - -#include "arm_compute/runtime/CL/functions/CLCast.h" - -#include "arm_compute/core/CL/kernels/CLCastKernel.h" - -using namespace arm_compute; - -void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, input_subtype); - _kernel = std::move(k); -} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp deleted file mode 100644 index db66250..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" - -#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" - -using namespace arm_compute; - -void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_size); - _kernel = std::move(k); -} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp new file mode 100644 index 0000000..3dede05 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include +#include + +namespace arm_compute +{ +using namespace arm_compute::misc::shape_calculator; + +CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( + std::shared_ptr memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _scale_f(), + _conv_f(), + _flip_weights(), + _scaled_output(), + _original_weights(nullptr), + _weights_flipped(), + _flip_axis(), + _is_prepared(false) +{ +} + +Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); + + auto out_dims = transposeconv_output_dimensions( + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, 
invalid_right, invalid_bottom); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); + + if (bias != nullptr) + { + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], + "Output's depth is invalid."); + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, weights_info)); + + return Status{}; +} + +void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, + invalid_right, invalid_bottom, weights_info); +} + +void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const DataLayout data_layout = input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + _original_weights = weights; + _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); + + auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, 
*input->info(), *weights->info()); + + // Output auto initialization if not yet initialized + auto_init_if_empty( + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _is_prepared = weights_info.retain_internal_weights(); + + _memory_group.manage(&_scaled_output); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order + // to match output shape + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + // configure scale function + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _scale_f.configure(input, &_scaled_output, upsample_info); + + // Setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info, + weights_info); + _scaled_output.allocator()->allocate(); + + // Setup flip axis data + _flip_axis.allocator()->allocate(); + _flip_axis.map(true); + auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer()); + if (weights->info()->data_layout() == DataLayout::NHWC) + { + axis_data[0] = 1; + axis_data[1] = 2; + } + else + { + axis_data[0] = 0; + axis_data[1] = 1; + } + _flip_axis.unmap(); +} + +void CLDirectTransposeConvLayer::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + _scale_f.run(); + _conv_f.run(); +} + +void CLDirectTransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + _flip_weights.run(); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + // Free flipped weights + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp index 3d9a28a..ae9d8af 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -47,7 +47,7 @@ using namespace arm_compute; void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups) { - auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>(); k->configure(input, output, lookups); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index f098832..0198946 100644 ---
a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -45,7 +45,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" #include @@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I ARM_COMPUTE_UNUSED(weights); ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output)); + CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -68,7 +68,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, output); _kernel = std::move(k); } @@ -172,7 +172,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _memory_group.manage(&_quantized_input); _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input); @@ -199,7 +200,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); @@ -256,8 +257,9 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor)); // Validate quantization symm8 kernel - const ITensorInfo &quantized_input = TensorInfo( - input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + const ITensorInfo &quantized_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); ARM_COMPUTE_RETURN_ON_ERROR( CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index 63e291b..2ff4b96 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -46,7 +46,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" #include @@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void 
CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>(); + auto k = support::cpp14::make_unique<CLTransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp index 9aebc47..157b4d9 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -53,18 +53,21 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp fc->configure(input_to_use, _weights, _biases, _output); return std::unique_ptr<arm_compute::IFunction>(fc); } - else + else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS) { - assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); - bool is_hybrid = (input->info()->data_type() == DataType::F32 || input->info()->data_type() == DataType::F16) && - weights->info()->data_type() == DataType::S8; + (weights->info()->data_type() == DataType::S8 || + weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) { auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager}; + ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info()); + const auto orgin_weights_data_type = weights_info->data_type(); + weights_info->set_data_type(DataType::QASYMM8_SIGNED); fc->configure(input_to_use, _weights, _biases, _output); + weights_info->set_data_type(orgin_weights_data_type); return std::unique_ptr<arm_compute::IFunction>(fc); } else @@ -74,6 +77,11 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp return std::unique_ptr<arm_compute::IFunction>(fc); } } + else + { + throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type"); + } + }(); if (_needs_reshape) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp deleted file mode 100644 index ca5499d..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/MemoryGroup.h" - -namespace arm_compute -{ -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::cl_gemm; - -namespace -{ -inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target) -{ - return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run); -} -} // namespace - -CLGEMMLowpMatrixMultiplyCoreEx::CLGEMMLowpMatrixMultiplyCoreEx( - std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _mm_midgard_kernel(), _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), _vector_sum_col(), _vector_sum_row(), _a_offset(0), _b_offset(0), - _reshape_b_only_on_first_run(false), _is_prepared(false) -{ -} - -void CLGEMMLowpMatrixMultiplyCoreEx::configure(const ICLTensor *a, const ICLTensor *b, - const ICLTensor *c, ICLTensor *output, - const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_UNUSED(c); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCoreEx::validate( - a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), gemm_info)); - - _is_prepared = false; - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _a_offset = a->info()->quantization_info().uniform().offset; - _b_offset = b->info()->quantization_info().uniform().offset; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - // Set the target for the kernels - _mm_midgard_kernel.set_target(gpu_target); - - // GEMMRHSMatrixInfo rhs_info; - // GEMMLHSMatrixInfo lhs_info; - - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, - // n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo - // in order to know how the matrices have been reshaped - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d - ? (a->info()->dimension(1) * a->info()->dimension(2)) - : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - const ICLTensor *matrix_b = b; - // Configure matrix multiply kernel - _mm_midgard_kernel.configure( - a, matrix_b, output, - GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); -} - -Status CLGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output, - const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_UNUSED(c); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), - "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), - "Matrix B already reshaped is not supported"); - - const ITensorInfo *matrix_a_info = a; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = - reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), gpu_target); - - const GEMMReshapeInfo reshape_info = - GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - - TensorInfo weights_info(*b); - const ITensorInfo *matrix_b_info = &weights_info; - if (reshape_matrix_b) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(false, - "CLGEMMLowpMatrixMultiplyCoreEx does not support reshape_b"); - } - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernelEx::validate( - matrix_a_info, matrix_b_info, output, reshape_info)); - - return Status{}; -} - -void CLGEMMLowpMatrixMultiplyCoreEx::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run matrix multiply - CLScheduler::get().enqueue(_mm_midgard_kernel, false); -} - -void CLGEMMLowpMatrixMultiplyCoreEx::prepare() -{ - if (!_is_prepared) - { - _is_prepared = true; - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp index f594d7a..e0b833b 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp @@ -48,7 +48,7 @@ using namespace arm_compute; void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, indices, output, axis); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp index 27ed8e8..65b89a3 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -47,7 +47,7 @@ using namespace arm_compute; void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, ICLTensor *output, ICLTensor *hits) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(lookups, keys, input, output, hits); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp index 80393e8..5a7e408 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp @@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma, ICLTensor *beta, float epsilon) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, output, gamma, beta, epsilon); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp deleted file mode 100644 index fbb15ab..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * 
Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLPReLU.h" - -#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" - -using namespace arm_compute; - -void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, alpha, output); - _kernel = std::move(k); - - if (output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; - - if (broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } -} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp deleted file mode 100644 index 6049b7e..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/ToolchainSupport.h" - -#include - -using namespace arm_compute; -using namespace arm_compute::misc::shape_calculator; - -CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), - _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), - _gemm_output(), _add_output(), _is_prepared(false) -{ -} - -Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *recurrent_weights, const ITensorInfo *bias, - const ITensorInfo *hidden_state, const ITensorInfo *output, - const ActivationLayerInfo &info) -{ - const int idx_width = 0; - const int idx_height = 1; - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, - output); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != - recurrent_weights->dimension(idx_width)); - ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != - recurrent_weights->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), - hidden_state->tensor_shape()); - - auto shape_info = - TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, - input->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate( - ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, 
ConvertPolicy::SATURATE)); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info)); - - return Status{}; -} - -void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights, - const ICLTensor *recurrent_weights, const ICLTensor *bias, - ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(), - recurrent_weights->info(), bias->info(), - hidden_state->info(), output->info(), info)); - - const int idx_height = 1; - TensorShape shape = - compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); - - _is_prepared = false; - - _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - - // Manage intermediate buffers and configure - _memory_group.manage(&_fully_connected_out); - _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); - - _memory_group.manage(&_gemm_output); - _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f); - - _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - _memory_group.manage(&_add_output); - - _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, - &_add_output, ConvertPolicy::SATURATE); - - _fully_connected_out.allocator()->allocate(); - _gemm_output.allocator()->allocate(); - - _activation_kernel.configure(&_add_output, hidden_state, info); - _add_output.allocator()->allocate(); - - _copy_kernel.configure(hidden_state, output); -} - -void CLRNNLayerEx::run() -{ - prepare(); - - _memory_group.acquire(); - - _fully_connected_kernel.run(); - _gemm_state_f.run(); - CLScheduler::get().enqueue(_add_kernel); - CLScheduler::get().enqueue(_activation_kernel); - - // copy hidden out to output - CLScheduler::get().enqueue(_copy_kernel); - - _memory_group.release(); -} - -void CLRNNLayerEx::prepare() -{ - if (!_is_prepared) - { - _fully_connected_kernel.prepare(); - _gemm_state_f.prepare(); - - _is_prepared = true; - } -} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp index 8ce2d74..a41e6db 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -60,8 +60,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0); // Create temporary tensor infos - auto interm_tensors = - arm_compute::support::cpp14::make_unique(num_of_interm_tensors); + auto interm_tensors = support::cpp14::make_unique(num_of_interm_tensors); // Create intermediate tensor info TensorShape shape{input->tensor_shape()}; @@ -119,9 +118,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, const size_t num_of_kernels = axis.size(); const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 
1 : 0); - _interm_tensors = arm_compute::support::cpp14::make_unique(num_of_interm_tensors); - _reduce_kernels = - arm_compute::support::cpp14::make_unique(num_of_kernels); + _interm_tensors = support::cpp14::make_unique(num_of_interm_tensors); + _reduce_kernels = support::cpp14::make_unique(num_of_kernels); // Set a vector that is ordered ICLTensors sequentially. std::vector tensors; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp deleted file mode 100644 index 7d7b226..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" - -#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" - -using namespace arm_compute; - -void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_size); - _kernel = std::move(k); -} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index e61746e..3215d01 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,218 +37,124 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ - #include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h" -#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/UtilsEx.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CPP/CPPScheduler.h" +#include <cmath> #include <memory> #include <tuple> using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; -CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _scale_f(), - _conv_f(), - _flip_weights(), - _scaled_output(), - _original_weights(nullptr), - _weights_flipped(), - _is_prepared(false) +CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_manager(std::move(memory_manager)), _function() +{ +} + +void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const PadStrideInfo &deconv_info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info) { + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, + invalid_right, invalid_bottom, weights_info); +} + +void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, + ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, + output->info(), deconv_info, invalid_right, + invalid_bottom, weights_info)) + { + case DeconvolutionMethod::DIRECT: + { + auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>(); + f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right, + invalid_bottom, weights_info); + _function = std::move(f); + break; + } + case DeconvolutionMethod::GEMM: + { + auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); + f->configure(compile_context, input, weights, bias, output, deconv_info); + _function = std::move(f); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } } Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, - const PadStrideInfo &info, unsigned int invalid_right, + const PadStrideInfo &deconv_info, unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - - const DataLayout data_layout = input->data_layout(); - - const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); -
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); - - const unsigned int kernel_x = weights->dimension(idx_w); - const unsigned int kernel_y = weights->dimension(idx_h); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1, - "invalid_right must be smaller than kernel_x"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1, - "inner_border_top must be smaller than kernel_y"); - - // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added. - auto out_dims = transposeconv_output_dimensions( - input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), - weights->dimension(idx_h), info, invalid_right, invalid_bottom); - - const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); - - if (bias != nullptr) + switch (CLTransposeConvLayer::get_deconvolution_method( + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) { - if (is_data_type_quantized_asymmetric(input->data_type())) + case DeconvolutionMethod::DIRECT: { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + // Validate direct convolution layer + ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); + break; } - else + case DeconvolutionMethod::GEMM: { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + // Validate gemm-based convolution layer + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); + break; } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); + default: + ARM_COMPUTE_ERROR("Not supported."); + break; } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], - "Output's width is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], - "Output's height is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], - "Output's depth is invalid."); - - unsigned int pad_left = 0; - unsigned int pad_right = 0; - unsigned int pad_top = 0; - unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); - TensorInfo scale_out_info(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(scale_out_shape) - .set_data_layout(data_layout)); - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - - ARM_COMPUTE_RETURN_ON_ERROR( - CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, weights_info)); - return Status{}; } -void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, - ICLTensor *output, const PadStrideInfo &info, - unsigned int invalid_right, unsigned int invalid_bottom, - const WeightsInfo &weights_info) +DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( + const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, + ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const 
WeightsInfo &weights_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; + ARM_COMPUTE_UNUSED(output, bias, weights_info); - const DataLayout data_layout = input->info()->data_layout(); + const DataLayout data_layout = input->data_layout(); const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - _original_weights = weights; - _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); - _flip_weights.configure(weights, &_weights_flipped); - - // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were - // added. - auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(idx_w), input->info()->dimension(idx_h), - weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, - invalid_bottom); - - const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); - - // Output auto initialization if not yet initialized - auto_init_if_empty( - *output->info(), - input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate( - input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); - - _is_prepared = weights_info.retain_internal_weights(); - - _memory_group.manage(&_scaled_output); - - // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order - // to match output shape - unsigned int pad_left = 0; - unsigned int pad_right = 0; - unsigned int pad_top = 0; - unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); - - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), - input->info()->quantization_info()); - scale_out_info.set_data_layout(data_layout); - _scaled_output.allocator()->init(scale_out_info); - - // configure scale function - const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, - DimensionRoundingType::FLOOR); - _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info); - - // setup the function to convolve the upscaled output - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info); - _scaled_output.allocator()->allocate(); + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 || + invalid_bottom != 0) + { + return DeconvolutionMethod::DIRECT; + } + + return DeconvolutionMethod::GEMM; } void CLTransposeConvLayer::run() { prepare(); - - _memory_group.acquire(); - - _scale_f.run(); - _conv_f.run(); - - _memory_group.release(); + _function->run(); } -void CLTransposeConvLayer::prepare() -{ - if (!_is_prepared) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - // Run weights flipping and mark original weights tensor as unused - _weights_flipped.allocator()->allocate(); - 
_weights_flipped.map(true); - _original_weights->map(CLScheduler::get().queue(), true); - CPPScheduler::get().schedule(&_flip_weights, Window::DimZ); - _weights_flipped.unmap(); - _original_weights->unmap(CLScheduler::get().queue()); - _original_weights->mark_as_unused(); - - // Prepare convolution - _conv_f.prepare(); - - if (!_weights_flipped.is_used()) - { - _weights_flipped.allocator()->free(); - } - - _is_prepared = true; - } -} +void CLTransposeConvLayer::prepare() { _function->prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp deleted file mode 100644 index 07feb5a..0000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/core/CL/ICLTensor.h" - -#include -#include -#include - -using namespace arm_compute; - -CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT - : _upsample(), - _output(nullptr) -{ -} - -Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, - const BorderSize &inner_border, - const PadStrideInfo &info) -{ - return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info); -} - -void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output, - const BorderSize &inner_border, - const PadStrideInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - _output = output; - _upsample.configure(input, _output, inner_border, info); -} - -void CLTransposeConvLayerUpsample::run() -{ - _output->map(CLScheduler::get().queue(), true); - if (is_data_type_quantized_asymmetric(_output->info()->data_type())) - { - const uint8_t quantized_zero = _output->info()->quantization_info().uniform().offset; - std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero); - } - else - { - memset(_output->buffer(), 0, _output->info()->total_size()); - } - _output->unmap(CLScheduler::get().queue()); - - CLScheduler::get().enqueue(_upsample, false); -} diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp index 114e1a7..768c15b 100644 --- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp @@ -41,14 +41,14 @@ #include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h" #include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" using namespace arm_compute; void CPPOneHotEx::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, const ITensor *off_value, ITensor *output, const int axis) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(indices, depth, on_value, off_value, output, axis); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp deleted file mode 100644 index 6c90ef3..0000000 --- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" - -#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; - -void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, info); - _kernel = std::move(k); -} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp index ff81ff8..2752eb6 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp @@ -42,7 +42,7 @@ #include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h" #include "arm_compute/runtime/IRuntimeContext.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -53,7 +53,7 @@ NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT void NEActivationLayerEx::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, output, activation_info); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp index e42c453..2fc94b2 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp @@ -42,7 +42,7 @@ #include #include "arm_compute/core/ITensor.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" #include @@ -53,7 +53,7 @@ template void NEBinaryLogicalOperationStatic::configure(ITensor *input1, ITensor *input2, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(COP, input1, input2, output); _kernel = std::move(k); } @@ -69,7 +69,7 @@ Status NEBinaryLogicalOperationStatic::validate(const ITensorInfo *input1, void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op) { - auto k = 
arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(op, input1, input2, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp deleted file mode 100644 index dc5c620..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/NEON/functions/NECast.h" - -#include "arm_compute/core/NEON/kernels/NECastKernel.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, input_subtype); - _kernel = std::move(k); -} - -Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, - SubDataType input_subtype) -{ - return NECastKernel::validate(input, output, input_subtype); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp deleted file mode 100644 index 5ec0b86..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" - -namespace arm_compute -{ -void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_shape); - _kernel = std::move(k); -} - -Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, - int32_t block_shape) -{ - return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp index 53fb150..e0ab3e0 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -41,13 +41,13 @@ #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" #include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" using namespace arm_compute; void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, output, lookups); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index f457732..a123439 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ 
b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -58,7 +58,7 @@ namespace Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ON_ERROR( - NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output)); + NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, output); _kernel = std::move(k); } @@ -158,7 +158,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _scale_factor.allocator()->init( TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); @@ -186,7 +187,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); @@ -224,8 +225,9 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate quantization kernel - const ITensorInfo &quantized_input = TensorInfo( - input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + const ITensorInfo &quantized_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR( NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp index fcac3c7..dc6c784 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -56,12 +56,17 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); bool is_hybrid = input->info()->data_type() == DataType::F32 && - weights->info()->data_type() == DataType::S8; + (weights->info()->data_type() == DataType::S8 || + weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) { auto fc = new 
arm_compute::NEFullyConnectedHybridLayer{_memory_manager}; + ITensorInfo *weights_info = const_cast(_weights->info()); + const auto orgin_weights_data_type = weights_info->data_type(); + weights_info->set_data_type(DataType::QASYMM8_SIGNED); fc->configure(input_to_use, _weights, _biases, _output); + weights_info->set_data_type(orgin_weights_data_type); return std::unique_ptr(fc); } else diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp deleted file mode 100644 index 1290cfd..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/ToolchainSupport.h" - -using namespace arm_compute; -using namespace arm_compute::misc::shape_calculator; - -NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx( - std::shared_ptr memory_manager) - : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), - _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), _offset_contribution_kernel(), - _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), - _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), - _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), - _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), - _fuse_output_stage(false), _flip_signedness(false) -{ -} - -void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c, - ITensor *output, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_UNUSED(c); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate( - a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); - - const ITensor *matrix_a = a; - const ITensor *matrix_b = b; - GEMMInfo info = gemm_info; - - // Clear state - _mtx_a_reshape_kernel = nullptr; - _mtx_b_reshape_kernel = nullptr; - - // Set internal variables - _a_offset = a->info()->quantization_info().uniform().offset; - _b_offset = b->info()->quantization_info().uniform().offset; - _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); - _is_prepared = false; - _fused_assembly_path = false; - _original_b = b; - - const ITensor *a_to_use = a; - - // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - _fuse_output_stage = true; - _memory_group.manage(&_mm_result_s32); - TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32); - _mm_result_s32.allocator()->init(info_mm_result_s32); - } - -#ifdef __aarch64__ - switch (a->info()->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - case DataType::U8: - case DataType::S8: - { - if (a_to_use->info()->data_type() == DataType::QASYMM8 && - info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - _asm_glue.configure(a_to_use, b, c, output, gemm_info); - _fused_assembly_path = _asm_glue.is_configured(); - } - else - { - _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? 
&_mm_result_s32 : output, - gemm_info); - } - _assembly_path = _asm_glue.is_configured(); - break; - } - default: - { - ARM_COMPUTE_ERROR("Datatype not supported"); - break; - } - } -#endif /* __aarch64__ */ - if (!(_assembly_path || _run_vector_matrix_multiplication)) - { - matrix_a = &_tmp_a; - matrix_b = &_tmp_b; - - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / - // 4.0f) ] - TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, - a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / - // 16.0f) ] - TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), - b->info()->quantization_info()); - _tmp_a.allocator()->init(a_info); - _tmp_b.allocator()->init(b_info); - _memory_group.manage(&_tmp_a); - if (!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - - // Configure interleave kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(a_to_use, &_tmp_a); - _mtx_a_reshape_kernel = std::move(k); - } - - // Configure transpose kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(b, &_tmp_b); - _mtx_b_reshape_kernel = std::move(k); - } - } - - if (!_fused_assembly_path) - { - // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if (_a_offset != 0) - { - TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); - - _vector_sum_col.allocator()->init(info_vector_sum_col); - if (!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_vector_sum_col); - } - - // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false); - } - - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if (_b_offset != 0) - { - TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); - - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); - - // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), - false); - } - - if (_fuse_output_stage) - { - // Configure matrix multiply kernel - if (!_assembly_path) - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(matrix_a, matrix_b, &_mm_result_s32); - _mm_kernel = std::move(k); - } - - _offset_contribution_output_stage_kernel.configure( - &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset, - _b_offset, info.gemmlowp_output_stage()); - } - else - { - // Configure matrix multiply kernel - if (!_assembly_path) - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(matrix_a, matrix_b, output); - _mm_kernel = std::move(k); - } - // Configure offset contribution kernel - _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? 
nullptr : &_vector_sum_row, - a_to_use->info()->dimension(0), _a_offset, _b_offset); - } - } - - // Allocate tensors - if (!_assembly_path && !_run_vector_matrix_multiplication) - { - _tmp_a.allocator()->allocate(); - if (!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if (!_fused_assembly_path) - { - if (_a_offset != 0 && !_reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - } - - if (_b_offset != 0) - { - _vector_sum_row.allocator()->allocate(); - } - } - - if (_fuse_output_stage) - { - _mm_result_s32.allocator()->allocate(); - } -} - -Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output, - const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, - "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), - "The product AB is defined only if the number of columns in A is " - "equal to the number of rows in B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), - "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), - "Matrix B already reshaped is not supported"); - - GEMMInfo info = gemm_info; - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; - - const ITensorInfo *a_to_use = a; - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - TensorInfo mm_result_s32_info{}; - - int32_t a_offset = a->quantization_info().uniform().offset; - int32_t b_offset = b->quantization_info().uniform().offset; - - bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; - if (fuse_output_stage) - { - auto_init_if_empty( - mm_result_s32_info, - a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); - } - - // Check if we need to run the optimized assembly kernel - bool run_optimised = false; - bool run_optimised_requantized = false; - if (a_to_use->data_type() == DataType::QASYMM8 && - info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info)); - run_optimised_requantized = run_optimised; - } - else - { - run_optimised = bool(NEGEMMAssemblyDispatch::validate( - a_to_use, b, c, fuse_output_stage ? 
&mm_result_s32_info : output, gemm_info)); - } - - if (run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if (info.depth_output_gemm3d() != 0) - { - if (info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), - "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, - "NEGEMM cannot reinterpret the output tensor as 3D"); - - const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - if (!run_vector_matrix_multiplication) - { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; - - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / - // 4.0f) ] - TensorShape shape_tmp_a = a->tensor_shape(); - shape_tmp_a.set(0, a->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width - // / 16.0f) ] - TensorShape shape_tmp_b = b->tensor_shape(); - shape_tmp_b.set(0, b->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); - } - } - - if (!run_optimised_requantized) - { - TensorInfo info_vector_sum_col{}; - TensorInfo info_vector_sum_row{}; - - // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if (a_offset != 0) - { - info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); - - // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate( - b, &info_vector_sum_col, a->dimension(0), false)); - } - - // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if (b_offset != 0) - { - info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); - - // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate( - a_to_use, &info_vector_sum_row, a->dimension(0), false)); - } - - if (fuse_output_stage) - { - if (!run_optimised) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate( - matrix_a_info, matrix_b_info, &mm_result_s32_info)); - } - - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate( - &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, - info.gemmlowp_output_stage())); - } - else - { - if (!run_optimised) - { - ARM_COMPUTE_RETURN_ON_ERROR( - NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); - } - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate( - output, a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset)); - } - } - return Status{}; -} - -void NEGEMMLowpMatrixMultiplyCoreEx::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Reshape inputs - if (_mtx_a_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - } - if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run) - { - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - } - - // Run GEMM - if (_asm_glue.is_configured()) - { - _asm_glue.run(); - } - else - { - NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); - } - - if (!_fused_assembly_path) - { - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if (_b_offset != 0) - { - NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX); - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if (_a_offset != 0 && !_reshape_b_only_on_first_run) - { - NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); - } - - if (_fuse_output_stage) - { - // Run offset contribution kernel - NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY); - } - else - { - // Run offset contribution kernel - NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); - } - } -} - -void NEGEMMLowpMatrixMultiplyCoreEx::prepare() -{ - if (!_is_prepared) - { - // Run assembly reshape - if (_asm_glue.is_configured() && _reshape_b_only_on_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - - _asm_glue.prepare(); - _original_b->mark_as_unused(); - } - // Run non-assembly reshape - else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - - // Run reshape kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - _original_b->mark_as_unused(); - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if (_a_offset != 0 && _reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); - } - - _is_prepared = true; - } -} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp index c8bb88a..433c35d 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp @@ -41,7 +41,7 @@ #include "arm_compute/runtime/NEON/functions/NEGatherEx.h" #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" #include @@ -49,7 +49,7 @@ namespace arm_compute { void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(input, indices, output, axis); _kernel = std::move(k); 
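  // Editor's note (inserted comment, not part of the original patch): the
  // angle-bracketed template arguments appear to have been stripped from this
  // patch text during extraction, so calls render as "make_unique();",
  // "reinterpret_cast(...)", "std::shared_ptr memory_manager", and so on.
  // Based on the NEGatherKernelEx header included above, the body of
  // NEGatherEx::configure() presumably reads:
  //
  //   auto k = support::cpp14::make_unique<NEGatherKernelEx>();
  //   k->configure(input, indices, output, axis);
  //   _kernel = std::move(k);
  //
  // The same reconstruction, make_unique<KernelType>() with the kernel type
  // named in the corresponding #include, presumably applies to the other
  // configure() bodies in this patch. The template parameters shown here are
  // assumptions recovered from context, not text present in the patch as
  // rendered in this document.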
} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp index 078019f..52d58ac 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp @@ -41,14 +41,14 @@ #include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h" #include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" -#include "support/ToolchainSupport.h" +#include "support/MemorySupport.h" using namespace arm_compute; void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, ITensor *hits) { - auto k = arm_compute::support::cpp14::make_unique(); + auto k = support::cpp14::make_unique(); k->configure(lookups, keys, input, output, hits); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp deleted file mode 100644 index dac3b84..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/NEPReLU.h" - -#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" -#include "support/ToolchainSupport.h" - -#include - -using namespace arm_compute; - -void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, alpha, output); - _kernel = std::move(k); -} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp deleted file mode 100644 index 0e9a5e9..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -namespace arm_compute -{ -NERNNLayerEx::NERNNLayerEx(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), - _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), - _gemm_output(), _add_output(), _is_prepared(false) -{ -} - -Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *recurrent_weights, const ITensorInfo *bias, - const ITensorInfo *hidden_state, const ITensorInfo *output, - const ActivationLayerInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, - output); - - const int idx_width = 0; - const int idx_height = 1; - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != - recurrent_weights->dimension(idx_width)); - ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != - recurrent_weights->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), - hidden_state->tensor_shape()); - - auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape( - recurrent_weights, hidden_state->dimension(idx_height)), - 1, input->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate( - &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info)); - - return Status{}; -} - -void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights, - const ITensor *recurrent_weights, const ITensor *bias, - ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(), - recurrent_weights->info(), bias->info(), - hidden_state->info(), output->info(), info)); - - const int idx_height = 1; - TensorShape shape = misc::shape_calculator::compute_rnn_shape( - recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); - - _is_prepared = false; - - // Manage intermediate buffers and configure - _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - - // Manage intermediate buffers and configure - _memory_group.manage(&_fully_connected_out); - _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); - - _memory_group.manage(&_gemm_output); - _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, 
&_gemm_output, 1.f, 0.f); - - _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); - _memory_group.manage(&_add_output); - - _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, - ConvertPolicy::SATURATE); - - _fully_connected_out.allocator()->allocate(); - _gemm_output.allocator()->allocate(); - - _activation_kernel.configure(&_add_output, hidden_state, info); - _add_output.allocator()->allocate(); - - _copy_kernel.configure(hidden_state, output); -} - -void NERNNLayerEx::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - _fully_connected_kernel.run(); - - _gemm_state_f.run(); - - NEScheduler::get().schedule(&_add_kernel, Window::DimY); - NEScheduler::get().schedule(&_activation_kernel, Window::DimY); - - // copy hidden out to output - NEScheduler::get().schedule(&_copy_kernel, Window::DimY); -} - -void NERNNLayerEx::prepare() -{ - if (!_is_prepared) - { - _fully_connected_kernel.prepare(); - _gemm_state_f.prepare(); - - _is_prepared = true; - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp deleted file mode 100644 index 116bba3..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), - _reduction_ops(), _keep_dims() -{ -} - -Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis, - bool keep_dims, const ITensorInfo *output) -{ - ARM_COMPUTE_UNUSED(keep_dims); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); - - TensorShape out_shape = input->tensor_shape(); - const unsigned int reduction_ops = reduction_axis.num_dimensions(); - const int input_dims = input->num_dimensions(); - Coordinates axis_local = reduction_axis; - - // Convert negative axis - for (unsigned int i = 0; i < reduction_ops; ++i) - { - axis_local[i] = wrap_around(axis_local[i], input_dims); - } - - std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for (unsigned int i = 0; i < reduction_ops; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); - ARM_COMPUTE_RETURN_ERROR_ON(static_cast(axis_local[i]) > - input->num_dimensions() - 1); - if (output->total_size() > 0 && keep_dims) - { - ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); - } - if (keep_dims) - { - out_shape.set(axis_local[i], 1); - } - else - { - out_shape.remove_dimension(axis_local[i] - i); - } - } - const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - - return Status{}; -} - -void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, - ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input); - - _reduction_ops = reduction_axis.num_dimensions(); - _reduction_kernels = - arm_compute::support::cpp14::make_unique(_reduction_ops); - _reduced_outs = - arm_compute::support::cpp14::make_unique(_reduction_ops - (keep_dims ? 1 : 0)); - _keep_dims = keep_dims; - - Coordinates axis_local = reduction_axis; - const int input_dims = input->info()->num_dimensions(); - const unsigned int reduction_ops = reduction_axis.num_dimensions(); - - // Convert negative axis - for (unsigned int i = 0; i < reduction_ops; ++i) - { - axis_local[i] = wrap_around(axis_local[i], input_dims); - } - - // Perform reduction for every axis - for (unsigned int i = 0; i < _reduction_ops; ++i) - { - TensorShape out_shape = i == 0 ? input->info()->tensor_shape() - : (_reduced_outs.get() + i - 1)->info()->tensor_shape(); - out_shape.set(axis_local[i], 1); - auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1); - - if (i == _reduction_ops - 1 && keep_dims) - { - _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); - } - else - { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), - input->info()->data_type(), - input->info()->quantization_info()) - .set_data_layout(output->info()->data_layout())); - _memory_group.manage(_reduced_outs.get() + i); - _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], - ReductionOperation::MEAN_SUM); - } - } - - // Allocate intermediate tensors - for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) - { - _reduced_outs[i].allocator()->allocate(); - } - - // Configure reshape layer if we want to drop the dimensions - if (!keep_dims) - { - TensorShape out_shape = input->info()->tensor_shape(); - - // We have to sort the reduction axis vectors in order for remove_dimension - // to work properly - std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for (unsigned int i = 0; i < _reduction_ops; ++i) - { - out_shape.remove_dimension(axis_local[i] - i); - } - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); - _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output); - } -} - -void NEReduceMeanEx::run() -{ - _memory_group.acquire(); - - for (unsigned int i = 0; i < _reduction_ops; ++i) - { - _reduction_kernels[i].run(); - } - - if (!_keep_dims) - { - _reshape.run(); - } - _memory_group.release(); -} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp deleted file mode 100644 index 198bb76..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -namespace arm_compute -{ -NESpaceToBatchLayerEx::NESpaceToBatchLayerEx() - : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) -{ -} - -void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape, - const ITensor *paddings, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); - - if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) - { - _has_padding = true; - _memset_kernel.configure( - output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); - } - _space_to_batch_kernel.configure(input, block_shape, paddings, output); -} - -void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x, - const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) - { - _has_padding = true; - _memset_kernel.configure( - output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); - } - _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, - output); -} - -Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape, - const ITensorInfo *paddings, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR( - NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); - - return Status{}; -} - -Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x, - const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate( - input, block_shape_x, block_shape_y, padding_left, padding_right, output)); - - return Status{}; -} - -void NESpaceToBatchLayerEx::run() -{ - // Zero out output only if we have paddings - if (_has_padding) - { - NEScheduler::get().schedule(&_memset_kernel, Window::DimY); - } - NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp deleted file mode 100644 index 97697e3..0000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" - -namespace arm_compute -{ -void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) -{ - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_shape); - _kernel = std::move(k); -} - -Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, - int32_t block_shape) -{ - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape)); - return Status{}; -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp index df06892..09f1780 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -1,21 +1,5 @@ /* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,14 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ - #include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/UtilsEx.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/runtime/NEON/NEScheduler.h" @@ -52,20 +33,15 @@ using namespace arm_compute::misc::shape_calculator; namespace arm_compute { + NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr memory_manager) // NOLINT : _memory_group(std::move(memory_manager)), _conv_f(), _upsample_f(), _flip_weights(), - _permute_input(), - _permute_weights(), - _permute_output(), _scaled_output(), _weights_flipped(), - _permuted_input(), - _permuted_weights(), - _permuted_output(), - _is_nchw(false), + _flip_axis(), _original_weights(nullptr), _input(nullptr), _info(), @@ -80,7 +56,7 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, - DataType::QASYMM8); + DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); const unsigned int width_idx = @@ -95,13 +71,16 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf weights->dimension(height_idx), info, invalid_right, invalid_bottom); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - if (is_data_type_quantized_asymmetric(input->data_type()) && bias) + if (bias != nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - } - else if (bias) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } } if (output->tensor_shape().total_size() > 0) @@ -110,12 +89,12 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(), - "Output's dim 0 is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(), - "Output's dim 1 is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(), - "Output's dim 2 is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); } unsigned int pad_left = 0; @@ -127,7 +106,6 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf pad_bottom); TensorInfo scale_out_info( input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); - scale_out_info.set_data_layout(input->data_layout()); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); const unsigned int batches_idx = @@ -149,19 
+127,13 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con ITensor *output, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom) { + // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( + input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); const DataLayout data_layout = input->info()->data_layout(); - - _input = input; - _original_weights = weights; - _info = info; - _is_prepared = false; - _is_nchw = data_layout == DataLayout::NCHW; - - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = @@ -173,101 +145,54 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + + _input = input; + _original_weights = weights; + _info = info; + _is_prepared = false; + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( - input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); - + _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); _memory_group.manage(&_scaled_output); - if (!_is_nchw) - { - _memory_group.manage(&_permuted_input); - _memory_group.manage(&_permuted_weights); - _memory_group.manage(&_permuted_output); - - // Configure the function to transform the input tensor from NHWC -> NCHW - _permuted_input.info()->set_quantization_info(input->info()->quantization_info()); - _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); - _permuted_input.info()->set_data_layout(DataLayout::NCHW); - - // Configure the function to transform the weights tensor from NHWC -> NCHW - _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info()); - _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); - _permuted_weights.info()->set_data_layout(DataLayout::NCHW); - - // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in - // order to match output shape - - unsigned int pad_left = 0; - unsigned int pad_right = 0; - unsigned int pad_top = 0; - unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right, - invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); - - TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(), - _permuted_input.info()->quantization_info()); - scale_out_info.set_data_layout(DataLayout::NCHW); - _scaled_output.allocator()->init(scale_out_info); - - const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, 
pad_bottom, - DimensionRoundingType::CEIL); - _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info); - - _weights_flipped.allocator()->init(*_permuted_weights.info()->clone()); - _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info()); - _flip_weights.configure(&_permuted_weights, &_weights_flipped); - - // setup the function to convolve the upscaled output - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - - const auto out_shape = output->info()->tensor_shape(); - TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]}; - TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(), - output->info()->quantization_info()); - _permuted_output.allocator()->init(permuted_out_info); - _permuted_output.info()->set_data_layout(DataLayout::NCHW); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info); - - // Configure the function to transform the convoluted output to NHWC - _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); - - _permuted_input.allocator()->allocate(); - _permuted_weights.allocator()->allocate(); - _permuted_output.allocator()->allocate(); - } - else - { - // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in - // order to match output shape - unsigned int pad_left = 0; - unsigned int pad_right = 0; - unsigned int pad_top = 0; - unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); - - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), - input->info()->quantization_info()); - _scaled_output.allocator()->init(scale_out_info); - const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, - DimensionRoundingType::FLOOR); - _upsample_f.configure(input, &_scaled_output, upsample_info); - - _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); - _flip_weights.configure(weights, &_weights_flipped); - - // setup the function to convolve the upscaled output - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); - } + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + + // Setup flip axis data + _flip_axis.allocator()->allocate(); + auto 
axis_data = reinterpret_cast(_flip_axis.buffer()); + axis_data[0] = static_cast(width_idx); + axis_data[1] = static_cast(height_idx); + _scaled_output.allocator()->allocate(); } @@ -275,22 +200,10 @@ void NETransposeConvLayer::run() { prepare(); - // MemoryGroupResourceScope scope_mg(_memory_group); - - // Permute input - if (!_is_nchw) - { - _permute_input.run(); - } + MemoryGroupResourceScope scope_mg(_memory_group); _upsample_f.run(); _conv_f.run(); - - // Permute output - if (!_is_nchw) - { - _permute_output.run(); - } } void NETransposeConvLayer::prepare() @@ -301,22 +214,12 @@ void NETransposeConvLayer::prepare() // Run weights flipping and mark original weights tensor as unused _weights_flipped.allocator()->allocate(); - // Permute weights - if (!_is_nchw) - { - _permute_weights.run(); - } - NEScheduler::get().schedule(&_flip_weights, Window::DimZ); + _flip_weights.run(); _original_weights->mark_as_unused(); // Prepare convolution _conv_f.prepare(); - if (!_weights_flipped.is_used()) - { - _weights_flipped.allocator()->free(); - } - _is_prepared = true; } } diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt index 09f6725..609dd45 100644 --- a/compute/cker/CMakeLists.txt +++ b/compute/cker/CMakeLists.txt @@ -8,6 +8,9 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp) target_link_libraries(nnfw_lib_cker INTERFACE ruy) target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation) target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV) +if(EXPERIMENTAL_RUY_FEATURE) + target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE) +endif(EXPERIMENTAL_RUY_FEATURE) if(PROFILE_RUY) target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler) endif(PROFILE_RUY) diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h index 41b1916..1bde640 100644 --- a/compute/cker/include/cker/Types.h +++ b/compute/cker/include/cker/Types.h @@ -259,6 +259,12 @@ struct FullyConnectedParams // FullyConnectedWeightsFormat weights_format; }; +struct L2NormParams +{ + // uint8 inference params. + int32_t input_zero_point; +}; + struct GatherParams { int32_t axis; @@ -338,6 +344,11 @@ struct SpaceToBatchParams int32_t output_offset; }; +struct SpaceToDepthParams +{ + int32_t block_size; +}; + enum class Order { kColMajor, diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h index b69d55c..2abb998 100644 --- a/compute/cker/include/cker/Utils.h +++ b/compute/cker/include/cker/Utils.h @@ -123,6 +123,68 @@ inline int CountLeadingZeros(uint32_t integer_input) return leading_zeros; } +inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, + int32_t *output_inv_sqrt, int *output_shift) +{ + assert(input >= 0); + if (input <= 1) + { + // Handle the input value 1 separately to avoid overflow in that case + // in the general computation below (b/143972021). Also handle 0 as if it + // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid + // but rare/unrealistic input value. We can expect both to occur in some + // incompletely trained models, but probably not in fully trained models. 
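The helper that starts here (and continues below) produces a fixed-point multiplier plus shift approximating 1/sqrt(x); the quantized L2 normalization added later in this patch uses it to divide by the norm with integer-only arithmetic. As a plain floating-point reference for the same refinement — the Newton-Raphson update x <- x * (3 - v * x * x) / 2, five iterations from x = 1, with v assumed to be pre-scaled roughly into [0.25, 1) as the integer code arranges — a minimal sketch:

```cpp
#include <cmath>
#include <cstdio>

// Floating-point reference for what GetInvSqrtQuantizedMultiplierExp computes
// with integer-only arithmetic: an approximation of 1 / sqrt(v), refined by
// the Newton-Raphson update x <- x * (3 - v * x * x) / 2.
int main()
{
  const double v = 0.64; // assume the input has already been scaled into [0.25, 1)
  double x = 1.0;        // naive starting guess, as in the fixed-point code
  for (int i = 0; i < 5; ++i)
    x = x * (3.0 - v * x * x) / 2.0;
  std::printf("newton = %f, reference = %f\n", x, 1.0 / std::sqrt(v)); // both ~1.25
}
```

The fixed-point version below performs the same iteration with gemmlowp's FixedPoint types and then folds the result into the returned multiplier/shift pair.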
+ *output_inv_sqrt = std::numeric_limits::max(); + *output_shift = 0; + return; + } + assert(input > 1); + *output_shift = 11; + while (input >= (1 << 29)) + { + input /= 4; + ++*output_shift; + } + const unsigned max_left_shift_bits = CountLeadingZeros(static_cast(input)) - 1; + const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; + const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; + *output_shift -= left_shift_bit_pairs; + input <<= 2 * left_shift_bit_pairs; + assert(input >= (1 << 27)); + assert(input < (1 << 29)); + using gemmlowp::FixedPoint; + using gemmlowp::Rescale; + using gemmlowp::SaturatingRoundingMultiplyByPOT; + // Using 3 integer bits gives us enough room for the internal arithmetic in + // this Newton-Raphson iteration. + using F3 = FixedPoint; + using F0 = FixedPoint; + const F3 fixedpoint_input = F3::FromRaw(input >> 1); + const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); + const F3 fixedpoint_half_three = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + // Newton-Raphson iteration + // Naive unoptimized starting guess: x = 1 + F3 x = F3::One(); + // Naive unoptimized number of iterations: 5 + for (int i = 0; i < 5; i++) + { + const F3 x3 = Rescale<3>(x * x * x); + x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); + } + const F0 fixedpoint_half_sqrt_2 = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); + x = x * fixedpoint_half_sqrt_2; + *output_inv_sqrt = x.raw(); + if (*output_shift < 0) + { + *output_inv_sqrt <<= -*output_shift; + *output_shift = 0; + } + // Convert right shift (right is positive) to left shift. + *output_shift *= reverse_shift; +} + // Comment from tensorflow lite: // // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h index 9bcf3fd..9b72811 100644 --- a/compute/cker/include/cker/operation/FullyConnected.h +++ b/compute/cker/include/cker/operation/FullyConnected.h @@ -78,8 +78,11 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size, output_data, /*result_stride=*/1); - // Apply activation function - ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + } } inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, @@ -195,7 +198,11 @@ inline void FullyConnectedHybrid(const FullyConnectedParams ¶ms, const Shape #endif // Apply activation function to floats. - ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + } return; } diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h new file mode 100644 index 0000000..a0075c3 --- /dev/null +++ b/compute/cker/include/cker/operation/L2Normalize.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_L2NORMALIZE_H__ +#define __NNFW_CKER_L2NORMALIZE_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +void L2NormalizeFloat32(const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + float epsilon = 1e-6; + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + for (int i = 0; i < outer_size; ++i) + { + float squared_l2_norm = 0; + for (int c = 0; c < depth; ++c) + { + const float val = input_data[c]; + squared_l2_norm += val * val; + } + float l2_norm = std::sqrt(squared_l2_norm); + l2_norm = std::max(l2_norm, epsilon); + for (int c = 0; c < depth; ++c) + { + *output_data = *input_data / l2_norm; + ++output_data; + ++input_data; + } + } +} + +void L2NormalizeQuant8(L2NormParams ¶ms, const Shape &input_shape, const uint8_t *input_data, + const Shape &output_shape, uint8_t *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int32_t input_zero_point = params.input_zero_point; + + for (int i = 0; i < outer_size; ++i) + { + int32_t square_l2_norm = 0; + for (int c = 0; c < depth; c++) + { + // Note that input_data advances by depth in the second pass below. 
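The float kernel above divides every element of the innermost dimension by max(sqrt(sum of squares), epsilon); the quantized kernel that continues below does the equivalent with the inverse-sqrt helper from Utils.h. A self-contained restatement of the float arithmetic (not using the cker headers; the input values are made up):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Each element of a row is divided by max(sqrt(sum of squares), epsilon).
int main()
{
  const std::vector<float> row{3.0f, 4.0f};
  const float epsilon = 1e-6f;

  float squared_l2_norm = 0.0f;
  for (float v : row)
    squared_l2_norm += v * v;
  const float l2_norm = std::max(std::sqrt(squared_l2_norm), epsilon);

  for (float v : row)
    std::printf("%f ", v / l2_norm); // 0.600000 0.800000
  std::printf("\n");
}
```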
+ int32_t diff = input_data[c] - input_zero_point; + square_l2_norm += diff * diff; + } + int32_t inv_l2norm_multiplier; + int inv_l2norm_shift; + GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift); + for (int c = 0; c < depth; c++) + { + int32_t diff = *input_data - input_zero_point; + int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + int32_t unclamped_output_val = 128 + rescaled_diff; + int32_t output_val = std::min(static_cast(255), + std::max(static_cast(0), unclamped_output_val)); + *output_data = static_cast(output_val); + ++input_data; + ++output_data; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_L2NORMALIZE_H__ diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h index 7477858..3d3e59e 100644 --- a/compute/cker/include/cker/operation/Logistic.h +++ b/compute/cker/include/cker/operation/Logistic.h @@ -32,18 +32,9 @@ namespace cker inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data) { -#ifdef __aarch64__ auto input_map = MapAsVector(input_data, input_shape); auto output_map = MapAsVector(output_data, output_shape); output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op()); -#else - // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2) - const int size = MatchingFlatSize(input_shape, output_shape); - for (int i = 0; i < size; i++) - { - output_data[i] = 1.f / (1.f + std::exp(-input_data[i])); - } -#endif } } // namespace cker diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h index af432f3..4a2732d 100644 --- a/compute/cker/include/cker/operation/Pad.h +++ b/compute/cker/include/cker/operation/Pad.h @@ -26,9 +26,10 @@ namespace nnfw { namespace cker { +template inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape, - const float *input_data, const Shape &output_shape, float *output_data, - const float *constant_value_data) + const T *input_data, const Shape &output_shape, T *output_data, + const T *constant_value_data) { // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC` // TODO: come up with more subtle solution that uses subtensors like arm compute @@ -38,7 +39,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu /** List of padding information */ using PaddingList = std::vector; - auto constant_value = constant_value_data ? *constant_value_data : 0; + const T constant_value = constant_value_data ? 
*constant_value_data : 0; assert(output_shape.DimensionsCount() == input_shape.DimensionsCount()); PaddingList padding_list(pad_rank); @@ -64,7 +65,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu { const int32_t in_row_len = input_shape.Dims(0); std::fill_n(output_data, padding_list[0].first, constant_value); - std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float)); + std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T)); std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second, constant_value); break; @@ -89,7 +90,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu out_offset += padding_list[1].first; // copy a row of input data - memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); + memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T)); out_offset += in_row_len; @@ -132,7 +133,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu out_offset += padding_list[2].first; // copy a row of input data - memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); + memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T)); out_offset += in_row_len; @@ -191,7 +192,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu out_c_offset += padding_list[3].first; // copy a row of input data - memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float)); + memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T)); out_c_offset += in_row_len; diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h new file mode 100644 index 0000000..5c82d11 --- /dev/null +++ b/compute/cker/include/cker/operation/Quantize.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_QUANTIZE_H__ +#define __NNFW_CKER_QUANTIZE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include +#include +namespace nnfw +{ +namespace cker +{ +template +inline void Quantize(const Shape &input_shape, const InputT *input_data, const Shape &output_shape, + OutputT *output_data, const float output_scale, const int32_t output_offset) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + int min_val = std::numeric_limits::min(); + int max_val = std::numeric_limits::max(); + + for (int i = 0; i < flat_size; i++) + { + int32_t unclamped = static_cast(round(input_data[i] / output_scale)) + output_offset; + int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + output_data[i] = clamped; + } +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_QUANTIZE_H__ diff --git a/compute/cker/include/cker/operation/SpaceToDepth.h b/compute/cker/include/cker/operation/SpaceToDepth.h new file mode 100644 index 0000000..ef67931 --- /dev/null +++ b/compute/cker/include/cker/operation/SpaceToDepth.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SPACE_TO_DEPTH_H__ +#define __NNFW_CKER_SPACE_TO_DEPTH_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template +inline void SpaceToDepth(const SpaceToDepthParams ¶ms, const Shape &unextended_input_shape, + const T *input_data, const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + + const int input_depth = input_shape.Dims(3); + const int batch_size = input_shape.Dims(0); + + // Number of continuous values that we can copy in one interation. 
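The Quantize helper added above applies the usual affine rule q = clamp(round(x / scale) + zero_point, qmin, qmax), taking qmin/qmax from the output type's numeric limits so the same template also covers int8 outputs. A worked uint8 example, independent of the cker headers (scale and zero point are made-up values):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
  const float scale = 0.5f;      // example quantization scale
  const int32_t zero_point = 10; // example zero point
  const float x = 3.2f;

  // round(3.2 / 0.5) = round(6.4) = 6; 6 + 10 = 16, already inside [0, 255].
  const int32_t unclamped = static_cast<int32_t>(std::round(x / scale)) + zero_point;
  const int32_t clamped = std::min<int32_t>(std::max<int32_t>(unclamped, 0), 255);
  std::printf("%d\n", clamped); // 16
}
```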
+ const int stride = params.block_size * input_depth; + + for (int batch = 0; batch < batch_size; ++batch) + { + for (int out_h = 0; out_h < output_height; ++out_h) + { + T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0); + for (int offset_h = 0; offset_h < params.block_size; ++offset_h) + { + T *dst = output_ptr; + for (int out_w = 0; out_w < output_width; ++out_w) + { + memcpy(dst, input_data, stride * sizeof(T)); + input_data += stride; + dst += output_depth; + } + output_ptr += stride; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__ diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h index 432b181..080f66f 100644 --- a/compute/cker/include/cker/ruy/RuySupport.h +++ b/compute/cker/include/cker/ruy/RuySupport.h @@ -24,7 +24,7 @@ namespace { -const int kDefaultNumThreadpoolThreads = 4; +const int kDefaultNumThreadpoolThreads = 1; } namespace nnfw diff --git a/docs/howto/how-to-build-runtime.md b/docs/howto/how-to-build-runtime.md index 2bfd14c..657f0f7 100644 --- a/docs/howto/how-to-build-runtime.md +++ b/docs/howto/how-to-build-runtime.md @@ -13,7 +13,7 @@ In the Ubuntu, you can easily install it with the following command. ``` $ sudo apt-get install cmake libboost-all-dev -``` +``` If your linux system does not have the basic development configuration, you will need to install more packages. A list of all packages needed to configure the development environment can be found in the https://github.com/Samsung/ONE/blob/master/infra/docker/Dockerfile.1804 file. @@ -44,7 +44,7 @@ python3-venv \ scons \ software-properties-common \ unzip \ -wget +wget $ mkdir /tmp/gtest $ cd /tmp/gtest @@ -63,7 +63,7 @@ In a typical linux development environment, including Ubuntu, you can build the ``` $ git clone https://github.com/Samsung/ONE.git one $ cd one -$ cp -n Makefile.template Makefile; make install +$ make -f Makefile.template install ``` Unfortunately, the debug build on the x86_64 architecture currently has an error. To solve the problem, you must use gcc version 9 or higher. Another workaround is to do a release build rather than a debug build. This is not a suitable method for debugging during development, but it is enough to check the function of the runtime. To release build the runtime, add the environment variable `BUILD_TYPE=release` to the build command as follows. diff --git a/docs/nnfw/howto/CrossBuildForAndroid.md b/docs/nnfw/howto/CrossBuildForAndroid.md index d7e48c8..08d5fd6 100644 --- a/docs/nnfw/howto/CrossBuildForAndroid.md +++ b/docs/nnfw/howto/CrossBuildForAndroid.md @@ -44,11 +44,9 @@ Different from cross build for linux, Here is an example of using Makefile. ```bash -cp -n Makefile.template Makefile - TARGET_OS=android \ CROSS_BUILD=1 \ NDK_DIR=/path/android-tools/r20/ndk \ EXT_ACL_FOLDER=/path/arm_compute-v19.11.1-bin-android/lib/android-arm64-v8a-neon-cl \ -make install +make -f Makefile.template install ``` diff --git a/docs/runtime/core.md b/docs/runtime/core.md index 42ba75f..64a6c62 100644 --- a/docs/runtime/core.md +++ b/docs/runtime/core.md @@ -68,7 +68,7 @@ Let's say we have some functions written in a certain programming language. Then With generated tensors and kernels, the compiler creates executor objects. There are 3 types of executors are supported - Linear, Dataflow, and Parallel. Linear executor is the default executor and Dataflow Executor and Parallel Executor are experimental. 
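Since the executors are only named in passing here, a purely hypothetical sketch may help picture the split of responsibilities: an executor owns the generated kernels and decides in what order (and with how much parallelism) they run. None of these type names are onert's real interfaces.

```cpp
#include <cstdio>
#include <functional>
#include <vector>

// Hypothetical sketch only: a "kernel" is a generated callable and an
// executor decides how the collection of kernels is run.
struct Kernel
{
  std::function<void()> run;
};

struct IExecutor
{
  virtual ~IExecutor() = default;
  virtual void execute() = 0;
};

// Linear executor: run the kernels one after another in topological order.
struct LinearExecutor : IExecutor
{
  std::vector<Kernel> kernels; // assumed already topologically sorted
  void execute() override
  {
    for (auto &k : kernels)
      k.run();
  }
};

int main()
{
  LinearExecutor exec;
  exec.kernels.push_back({[] { std::printf("Conv2D\n"); }});
  exec.kernels.push_back({[] { std::printf("Relu\n"); }});
  exec.execute();
}
```

A dataflow or parallel executor would instead dispatch each kernel as soon as its inputs are ready, possibly across several threads.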
-For more about executors, please refer to [Executors](./executors.md) document. +For more about executors, please refer to [Executors](executors.md) document. ### Module `exec` @@ -83,4 +83,4 @@ For more about executors, please refer to [Executors](./executors.md) document. Backends are plugins and they are loaded dynamically(via `dlopen`). So this module is a set of interface classes for backend implementation. `compiler` can compile with a variety of backends without knowing specific backend implementation. -Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](./backend-api.md) document. +Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](backend-api.md) document. diff --git a/docs/runtime/heterogeneous-execution.md b/docs/runtime/heterogeneous-execution.md index dc39dae..e7a5e27 100644 --- a/docs/runtime/heterogeneous-execution.md +++ b/docs/runtime/heterogeneous-execution.md @@ -12,11 +12,11 @@ Here is another case. Let's say we have a model that is not sequential so there ![Add-3Conv model](heterogeneous-execution-add-3-conv-model.png) -Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](./executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently. +Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently. ## Graph Transformation -Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](./core.md#1-lowering) phase of compilation. +Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](core.md#1-lowering) phase of compilation. Here is an example of that. Let's say we have assigned different backends for Add and Conv2D. So a Permute operation is inserted between them. 
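The insertion described above is local: whenever two adjacent operations end up on backends whose data layouts differ, a Permute is spliced onto the connecting edge. A heavily simplified, hypothetical sketch of that rule (these structs are illustrative and do not correspond to onert's PermutationInsertionPass API; the backend names are placeholders):

```cpp
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

// Illustrative structs only -- not onert's real IR or pass classes.
enum class Layout { NHWC, NCHW };

struct Op
{
  std::string name;
  std::string backend; // placeholder backend names below
  Layout layout;
};

std::vector<Op> insert_permutes(const std::vector<Op> &sequence)
{
  std::vector<Op> lowered;
  for (std::size_t i = 0; i < sequence.size(); ++i)
  {
    lowered.push_back(sequence[i]);
    const bool has_next = i + 1 < sequence.size();
    if (has_next && sequence[i].layout != sequence[i + 1].layout)
    {
      // Backend boundary with a layout mismatch: copy + layout conversion.
      lowered.push_back(Op{"Permute", "builtin", sequence[i + 1].layout});
    }
  }
  return lowered;
}

int main()
{
  const std::vector<Op> graph{{"Add", "cpu", Layout::NHWC}, {"Conv2D", "gpu", Layout::NCHW}};
  for (const auto &op : insert_permutes(graph))
    std::printf("%s (%s)\n", op.name.c_str(), op.backend.c_str()); // Add, Permute, Conv2D
}
```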
diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake index 51a235a..adec1f9 100644 --- a/infra/cmake/packages/ARMComputeSourceConfig.cmake +++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake @@ -8,7 +8,7 @@ function(_ARMComputeSource_import) nnas_include(OptionTools) envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") - set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v19.11.1.tar.gz) + set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v20.05.tar.gz) ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL}) set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE) diff --git a/infra/cmake/packages/FlatBuffersConfig.cmake b/infra/cmake/packages/FlatBuffersConfig.cmake index ab0b770..da084e7 100644 --- a/infra/cmake/packages/FlatBuffersConfig.cmake +++ b/infra/cmake/packages/FlatBuffersConfig.cmake @@ -25,7 +25,8 @@ function(_FlatBuffers_build) BUILD_DIR ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS/build INSTALL_DIR ${EXT_OVERLAY_DIR} BUILD_FLAGS ${ADDITIONAL_CXX_FLAGS} - IDENTIFIER "1.10-fix1" + IDENTIFIER "1.10-fix2" + EXTRA_OPTS "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF" PKG_NAME "FLATBUFFERS") endfunction(_FlatBuffers_build) diff --git a/infra/cmake/packages/HDF5Config.cmake b/infra/cmake/packages/HDF5Config.cmake index e282e0b..19803f1 100644 --- a/infra/cmake/packages/HDF5Config.cmake +++ b/infra/cmake/packages/HDF5Config.cmake @@ -27,6 +27,7 @@ _HDF5_build() find_path(HDF5_CONFIG_DIR "hdf5-config.cmake" PATHS ${EXT_OVERLAY_DIR} PATH_SUFFIXES + cmake share/cmake share/cmake/hdf5 cmake/hdf5 diff --git a/infra/cmake/packages/Pybind11Config.cmake b/infra/cmake/packages/Pybind11Config.cmake new file mode 100644 index 0000000..3061779 --- /dev/null +++ b/infra/cmake/packages/Pybind11Config.cmake @@ -0,0 +1,21 @@ +function(_Pybind11_import) + nnas_find_package(Pybind11Source QUIET) + + if(NOT Pybind11Source_FOUND) + set(Pybind11_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT Pybind11Source_FOUND) + + nnas_include(ExternalBuildTools) + ExternalBuild_CMake(CMAKE_DIR ${Pybind11Source_DIR} + BUILD_DIR ${CMAKE_BINARY_DIR}/externals/PYBIND11/build + INSTALL_DIR ${EXT_OVERLAY_DIR} + IDENTIFIER "2.3.0" + PKG_NAME "PYBIND11") + + find_path(Pybind11_INCLUDE_DIRS NAMES pybind11.h PATHS ${EXT_OVERLAY_DIR} PATH_SUFFIXES include/pybind11) + + set(Pybind11_FOUND TRUE PARENT_SCOPE) +endfunction(_Pybind11_import) + +_Pybind11_import() diff --git a/infra/cmake/packages/Pybind11SourceConfig.cmake b/infra/cmake/packages/Pybind11SourceConfig.cmake new file mode 100644 index 0000000..4a9c676 --- /dev/null +++ b/infra/cmake/packages/Pybind11SourceConfig.cmake @@ -0,0 +1,18 @@ +function(_Pybind11Source_import) + if(NOT DOWNLOAD_PYBIND11) + set(Pybind11Source_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT DOWNLOAD_PYBIND11) + + nnas_include(ExternalSourceTools) + nnas_include(OptionTools) + + envoption(PYBIND11_URL https://github.com/pybind/pybind11/archive/v2.3.0.tar.gz) + + ExternalSource_Download(PYBIND11 ${PYBIND11_URL}) + + set(Pybind11Source_DIR ${PYBIND11_SOURCE_DIR} PARENT_SCOPE) + set(Pybind11Source_FOUND TRUE PARENT_SCOPE) +endfunction(_Pybind11Source_import) + +_Pybind11Source_import() diff --git a/infra/docker/Dockerfile b/infra/docker/Dockerfile index e675b53..052cc4f 100644 --- a/infra/docker/Dockerfile +++ b/infra/docker/Dockerfile @@ -1,8 +1,6 @@ FROM ubuntu:16.04 ARG UBUNTU_MIRROR -ENV http_proxy $http_proxy -ENV https_proxy $https_proxy RUN if [ -n "$http_proxy" ] ; 
then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi @@ -22,6 +20,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler # Additonal tools RUN apt-get update && apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint +RUN pip3 install --upgrade pip RUN pip3 install yapf==0.22.0 numpy # Install google test (source) diff --git a/infra/docker/Dockerfile.1804 b/infra/docker/Dockerfile.1804 index fc6fc9a..cc31bba 100644 --- a/infra/docker/Dockerfile.1804 +++ b/infra/docker/Dockerfile.1804 @@ -1,12 +1,6 @@ FROM ubuntu:18.04 ARG UBUNTU_MIRROR -ENV http_proxy $http_proxy -ENV https_proxy $https_proxy - -RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi -RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi -RUN if [ -n "$UBUNTU_MIRROR" ] ; then sed "s/archive.ubuntu.com/${UBUNTU_MIRROR}/g" -i /etc/apt/sources.list ; fi # Install 'add-apt-repository' RUN apt-get update && apt-get -qqy install software-properties-common @@ -22,6 +16,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler # Additonal tools RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint +RUN pip3 install --upgrade pip RUN pip3 install yapf==0.22.0 numpy # Install google test (source) diff --git a/infra/nncc/CMakeLists.txt b/infra/nncc/CMakeLists.txt index 3ac6680..0be6885 100644 --- a/infra/nncc/CMakeLists.txt +++ b/infra/nncc/CMakeLists.txt @@ -98,6 +98,7 @@ option(DOWNLOAD_CAFFE "Download Caffe source" ON) option(DOWNLOAD_PYTORCH "Download Pytorch source" ON) option(DOWNLOAD_ONNX "Download ONNX source" ON) option(DOWNLOAD_ABSEIL "Download Abseil-cpp source" ON) +option(DOWNLOAD_PYBIND11 "Download Pybind11 source" ON) option(DOWNLOAD_GTEST "Download Google Test source" ON) option(BUILD_GTEST "Build Google Test from the downloaded source" ON) diff --git a/infra/nncc/command/utcount b/infra/nncc/command/utcount index d4610e3..d06c5c9 100644 --- a/infra/nncc/command/utcount +++ b/infra/nncc/command/utcount @@ -13,7 +13,7 @@ BUILD_ITEMS="angkor cwrap pepper-str pepper-strcast pp stdex \ oops pepper-assert \ hermes hermes-std \ loco locop locomotiv logo-core logo \ -foder souschef arser \ +foder souschef arser vconone \ safemain mio-circle mio-tflite \ tflite2circle \ luci \ diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt index 8e7f78e..2442a2d 100644 --- a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt +++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt @@ -100,7 +100,7 @@ target_include_directories(tensorflow-lite-2.2.0 SYSTEM PUBLIC ${TFLITE_INCLUDES target_compile_definitions(tensorflow-lite-2.2.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV") set_property(TARGET tensorflow-lite-2.2.0 PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries(tensorflow-lite-2.2.0 eigen ${LIB_PTHREAD} dl) -if(${BUILD_WITH_NNAPI}) +if(NOT ANDROID AND ${BUILD_WITH_NNAPI}) target_link_libraries(tensorflow-lite-2.2.0 rt) endif() diff --git a/infra/nnfw/config/gbs.conf b/infra/nnfw/config/gbs.conf index 
515cada..bad9eb2 100644 --- a/infra/nnfw/config/gbs.conf +++ b/infra/nnfw/config/gbs.conf @@ -5,7 +5,7 @@ profile = profile.tizen [profile.tizen] user=obs_viewer obs = obs.tizen -repos = repo.tizen_base,repo.tizen_mobile +repos = repo.tizen_one,repo.tizen_base,repo.tizen_mobile buildroot = /home/GBS-ROOT/ [obs.tizen] @@ -15,6 +15,8 @@ url = http://api.tizen.org url = http://download.tizen.org/snapshots/tizen/unified/latest/repos/standard/packages/ [repo.tizen_base] -url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/ +url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/ +[repo.tizen_one] +url = http://nnfw.mooo.com/archive/tizen/ diff --git a/infra/packaging/preset/20200630 b/infra/packaging/preset/20200630 index e159935..c3ca4b6 100644 --- a/infra/packaging/preset/20200630 +++ b/infra/packaging/preset/20200630 @@ -14,6 +14,7 @@ function preset_configure() REQUIRED_UNITS+=("souschef") REQUIRED_UNITS+=("safemain") REQUIRED_UNITS+=("arser") + REQUIRED_UNITS+=("vconone") # Hermes Logging Framework REQUIRED_UNITS+=("hermes" "hermes-std") # loco IR and related utilities @@ -28,11 +29,14 @@ function preset_configure() REQUIRED_UNITS+=("record-minmax" "circle-quantizer") REQUIRED_UNITS+=("one-cmds") + NPROC=${NPROC:-$(cat /proc/cpuinfo | grep -c processor)} + # TODO Use "nncc configure" and "nncc build" cmake \ -DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \ -DCMAKE_BUILD_TYPE=release \ -DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \ + -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \ ${EXTRA_OPTIONS[@]} \ "${NNAS_PROJECT_PATH}/infra/nncc" } @@ -44,14 +48,4 @@ function preset_install() # Install tf2nnpkg install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.${PRESET}" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg" - - # Create python virtual enviornment - python3 -m venv "${NNAS_INSTALL_PREFIX}/bin/venv" - - # Install tensorflow - source "${NNAS_INSTALL_PREFIX}/bin/venv/bin/activate" - python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ - install -U pip setuptools - python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ - install tensorflow-cpu==2.3.0rc0 } diff --git a/infra/packaging/res/tf2nnpkg.20200630 b/infra/packaging/res/tf2nnpkg.20200630 index 9101f82..7846fd3 100644 --- a/infra/packaging/res/tf2nnpkg.20200630 +++ b/infra/packaging/res/tf2nnpkg.20200630 @@ -14,10 +14,16 @@ command_exists() { usage() { echo "Convert TensorFlow model to nnpackage." 
- echo "Usage: tf2nnpkg --info --graphdef [OPTION] -o " - exit 0 + echo "Usage: tf2nnpkg" + echo " --info " + echo " --graphdef " + echo " -o " + echo " --v2 (optional) Use TF 2.x interface" + exit 255 } +TF_INTERFACE="--v1" + # Parse command-line arguments # while [ "$#" -ne 0 ]; do @@ -39,6 +45,10 @@ while [ "$#" -ne 0 ]; do export OUTPUT_DIR="$2" shift 2 ;; + '--v2') + TF_INTERFACE="--v2" + shift + ;; *) echo "${CUR}" shift @@ -83,10 +93,7 @@ OUTPUT=$(awk -F, '/^output/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' ' INPUT_SHAPES=$(grep ^input ${INFO_FILE} | cut -d "[" -f2 | cut -d "]" -f1 | tr -d ' ' | xargs | tr ' ' ':') # generate tflite file -python "${ROOT}/bin/tf2tfliteV2.py" --v2 --input_path ${GRAPHDEF_FILE} \ ---output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ ---input_arrays ${INPUT} --output_arrays ${OUTPUT} || \ -python "${ROOT}/bin/tf2tfliteV2.py" --v1 --input_path ${GRAPHDEF_FILE} \ +python "${ROOT}/bin/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${GRAPHDEF_FILE} \ --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ --input_arrays ${INPUT} --input_shapes ${INPUT_SHAPES} \ --output_arrays ${OUTPUT} diff --git a/infra/scripts/build-tcm.sh b/infra/scripts/build-tcm.sh new file mode 100644 index 0000000..22fb335 --- /dev/null +++ b/infra/scripts/build-tcm.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# +# STEP 1 +# Download latest TCM tool from +# https://github.sec.samsung.net/RS-TCM/tca-standalone/releases/download/v0.0.8/tca-standalone-0.0.8.jar +# +# STEP 2 +# Create symbolic link `./src` for source directory to be analyzed which has `.ahub` configuration. +# +# STEP 3 +# run this `build-tcm.sh` script. +# +# See the following link for additional details. +# https://github.sec.samsung.net/RS-TCM/tca-standalone/wiki/Tutorials-CPP-Gtest +# + +echo ${PROJECT_DIR:=${PWD}} + +java -jar $PROJECT_DIR/tca-standalone-0.0.8.jar \ + --outdir=$PROJECT_DIR/tcm-output \ + --config=$PROJECT_DIR/.ahub/tcchecker-tca/config.yaml \ + --local=$PROJECT_DIR/src \ + --logfile=$PROJECT_DIR/tcm-output/tcm.log \ + --debug diff --git a/infra/scripts/compiler_modules.sh b/infra/scripts/compiler_modules.sh index d436e8a..a0323e0 100644 --- a/infra/scripts/compiler_modules.sh +++ b/infra/scripts/compiler_modules.sh @@ -7,7 +7,7 @@ DEBUG_BUILD_ITEMS="angkor;cwrap;pepper-str;pepper-strcast;pp;stdex" DEBUG_BUILD_ITEMS+=";oops;pepper-assert" DEBUG_BUILD_ITEMS+=";hermes;hermes-std" DEBUG_BUILD_ITEMS+=";loco;locop;locomotiv;logo-core;logo" -DEBUG_BUILD_ITEMS+=";foder;souschef;arser" +DEBUG_BUILD_ITEMS+=";foder;souschef;arser;vconone" DEBUG_BUILD_ITEMS+=";safemain;mio-circle;mio-tflite" DEBUG_BUILD_ITEMS+=";tflite2circle" DEBUG_BUILD_ITEMS+=";luci" diff --git a/infra/scripts/docker_build_cross_aarch64_runtime.sh b/infra/scripts/docker_build_cross_aarch64_runtime.sh index 7da6736..011d14c 100755 --- a/infra/scripts/docker_build_cross_aarch64_runtime.sh +++ b/infra/scripts/docker_build_cross_aarch64_runtime.sh @@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # prepare rootfs -if [ ! -d $ROOTFS_DIR ]; then +if [ -z "$ROOTFS_DIR" ] || [ ! 
-d $ROOTFS_DIR ]; then echo "It will use default rootfs path" else DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" diff --git a/infra/scripts/docker_build_cross_arm_runtime.sh b/infra/scripts/docker_build_cross_arm_runtime.sh index f1f666a..551fb57 100755 --- a/infra/scripts/docker_build_cross_arm_runtime.sh +++ b/infra/scripts/docker_build_cross_arm_runtime.sh @@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # prepare rootfs -if [ ! -d $ROOTFS_DIR ]; then +if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then echo "It will use default rootfs path" else DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" diff --git a/infra/scripts/docker_build_cross_arm_runtime_release.sh b/infra/scripts/docker_build_cross_arm_runtime_release.sh index ea66f17..876f318 100755 --- a/infra/scripts/docker_build_cross_arm_runtime_release.sh +++ b/infra/scripts/docker_build_cross_arm_runtime_release.sh @@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # prepare rootfs -if [ ! -d $ROOTFS_DIR ]; then +if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then echo "It will use default rootfs path" else DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" diff --git a/infra/scripts/docker_build_cross_coverage.sh b/infra/scripts/docker_build_cross_coverage.sh index 08244e5..f42251b 100755 --- a/infra/scripts/docker_build_cross_coverage.sh +++ b/infra/scripts/docker_build_cross_coverage.sh @@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # prepare rootfs -if [ ! -d $ROOTFS_DIR ]; then +if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then echo "It will use default rootfs path" else DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" diff --git a/infra/scripts/docker_build_nncc.sh b/infra/scripts/docker_build_nncc.sh index 418b50d..5b12531 100755 --- a/infra/scripts/docker_build_nncc.sh +++ b/infra/scripts/docker_build_nncc.sh @@ -54,6 +54,16 @@ pushd $ROOT_PATH > /dev/null mkdir -p ${NNCC_INSTALL_PREFIX} ./nncc docker-run ./nnas create-package --prefix "${PWD}/${NNCC_INSTALL_PREFIX}" -- "${CONFIG_OPTIONS}" +# create python virtual environment +./nncc docker-run python3 -m venv "${NNCC_INSTALL_PREFIX}/bin/venv" + +./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \ + -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ + install -U pip setuptools +./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \ + -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ + install tensorflow-cpu==2.3.0rc0 + mkdir -p ${ARCHIVE_PATH} tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} ./ diff --git a/infra/scripts/docker_build_tizen_cross.sh b/infra/scripts/docker_build_tizen_cross.sh index 18809ad..ee0f183 100755 --- a/infra/scripts/docker_build_tizen_cross.sh +++ b/infra/scripts/docker_build_tizen_cross.sh @@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_PATH="$CURRENT_PATH/../../" # prepare rootfs -if [ ! -d $ROOTFS_DIR ]; then +if [ -z "$ROOTFS_DIR" ] || [ ! 
-d $ROOTFS_DIR ]; then echo "It will use default rootfs path" else DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" diff --git a/infra/scripts/docker_collect_nnpkg_resources.sh b/infra/scripts/docker_collect_nnpkg_resources.sh index 556c5bd..55adaa1 100755 --- a/infra/scripts/docker_collect_nnpkg_resources.sh +++ b/infra/scripts/docker_collect_nnpkg_resources.sh @@ -60,7 +60,7 @@ pushd $ROOT_PATH > /dev/null REQUIRED_UNITS=() # Common Libraries REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex") -REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "oops") +REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "vconone") # Hermes Logging Framework REQUIRED_UNITS+=("hermes" "hermes-std") # loco IR and related utilities diff --git a/infra/scripts/tizen_xu4_test.sh b/infra/scripts/tizen_xu4_test.sh index 5521b5f..640a0e0 100755 --- a/infra/scripts/tizen_xu4_test.sh +++ b/infra/scripts/tizen_xu4_test.sh @@ -23,7 +23,7 @@ function install_model() { # download tflite model files pushd $HOST_HOME - tests/scripts/framework/run_test.sh --download=on + tests/scripts/framework/run_test.sh --download=on --run=off # TODO Since this command removes model file(.zip), # We must always download the file unlike model file(.tflite). # Because caching applies only to tflite file. diff --git a/master_diff_1.7.0.patch b/master_diff_1.7.0.patch new file mode 100644 index 0000000..feae398 --- /dev/null +++ b/master_diff_1.7.0.patch @@ -0,0 +1,30424 @@ +diff --git a/.ahub/tcchecker-tca/config.yaml b/.ahub/tcchecker-tca/config.yaml +new file mode 100644 +index 0000000..cd34d79 +--- /dev/null ++++ b/.ahub/tcchecker-tca/config.yaml +@@ -0,0 +1,43 @@ ++version: 2 ++test: ++ - name: NN Runtime ++ testCaseLanguage: CPP ++ testFW: GTEST ++ testCaseFolder: ++ - ./compute/test/cker ++ - ./runtime/onert/core/src/backend/cpu_common ++ - ./runtime/onert/frontend/nnapi ++ - ./runtime/onert/test/core/compiler ++ - ./runtime/onert/test/core/exec ++ - ./runtime/onert/test/core/interp ++ - ./runtime/onert/test/graph ++ - ./runtime/onert/test/graph/operand ++ - ./runtime/onert/test/graph/operation ++ - ./runtime/onert/test/graph/verifier ++ - ./runtime/onert/test/ir ++ - ./runtime/onert/test/util ++ - ./tests/nnapi/src ++ - ./tests/nnfw_api/src ++ - ./tests/tools/tflite_run/src ++ ++ testFile: ++ - extension: cpp ++ any: true ++ - extension: cc ++ any: true ++ ++ testCase: ++ - condition: ++ - functionName: ++ starts: ++ - TEST ++ ++ negativeTestCase: ++ - condition: ++ - testName: ++ starts: ++ - neg_ ++ ++ positiveTestCase: ++ - condition: ++ - inverse: negativeTestCase +diff --git a/compiler/.ahub/tcchecker-tca/config.yaml b/compiler/.ahub/tcchecker-tca/config.yaml +new file mode 100644 +index 0000000..ef681de +--- /dev/null ++++ b/compiler/.ahub/tcchecker-tca/config.yaml +@@ -0,0 +1,54 @@ ++version: 2 ++test: ++ - name: NN Compiler ++ testCaseLanguage: CPP ++ testFW: GTEST ++ testCaseFolder: ++ - ./angkor ++ - ./arser ++ - ./circle2circle ++ - ./circle-quantizer ++ - ./cwrap ++ - ./foder ++ - ./hermes ++ - ./hermes-std ++ - ./loco ++ - ./locomotiv ++ - ./locop ++ - ./logo ++ - ./logo-core ++ - ./luci ++ - ./luci-interpreter ++ - ./luci-value-test ++ - ./mio-circle ++ - ./mio-tflite ++ - ./oops ++ - ./pepper-assert ++ - ./pepper-str ++ - ./pepper-strcast ++ - ./pp ++ - ./record-minmax ++ - ./safemain ++ - ./souschef ++ - ./stdex ++ - ./tflite2circle ++ ++ testFile: ++ - extension: .test.cpp ++ any: true ++ ++ testCase: ++ - condition: ++ - functionName: ++ starts: ++ - TEST ++ ++ negativeTestCase: ++ - 
condition: ++ - testName: ++ ends: ++ - _NEG ++ ++ positiveTestCase: ++ - condition: ++ - inverse: negativeTestCase +diff --git a/compiler/bcq-tools/CMakeLists.txt b/compiler/bcq-tools/CMakeLists.txt +new file mode 100644 +index 0000000..ae231bd +--- /dev/null ++++ b/compiler/bcq-tools/CMakeLists.txt +@@ -0,0 +1,27 @@ ++set(BCQ_TOOLS_FILES ++ generate_bcq_output_arrays ++ preserve_bcq_info ++) ++ ++foreach(BCQ_TOOLS IN ITEMS ${BCQ_TOOLS_FILES}) ++ ++ set(BCQ_TOOLS_FILE ${BCQ_TOOLS}) ++ set(BCQ_TOOLS_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${BCQ_TOOLS_FILE}") ++ set(BCQ_TOOLS_BIN "${CMAKE_CURRENT_BINARY_DIR}/${BCQ_TOOLS_FILE}") ++ set(BCQ_TOOLS_TARGET "${BCQ_TOOLS}_target") ++ ++ add_custom_command(OUTPUT ${BCQ_TOOLS_BIN} ++ COMMAND ${CMAKE_COMMAND} -E copy "${BCQ_TOOLS_SRC}" "${BCQ_TOOLS_BIN}" ++ DEPENDS ${BCQ_TOOLS_SRC} ++ COMMENT "Generate ${BCQ_TOOLS_BIN}" ++ ) ++ ++ add_custom_target(${BCQ_TOOLS_TARGET} ALL DEPENDS ${BCQ_TOOLS_BIN}) ++ ++ install(FILES ${BCQ_TOOLS_BIN} ++ PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE ++ GROUP_READ GROUP_WRITE GROUP_EXECUTE ++ WORLD_READ WORLD_EXECUTE ++ DESTINATION bin) ++ ++endforeach(BCQ_TOOLS) +diff --git a/compiler/bcq-tools/README.md b/compiler/bcq-tools/README.md +new file mode 100644 +index 0000000..18b0f48 +--- /dev/null ++++ b/compiler/bcq-tools/README.md +@@ -0,0 +1,78 @@ ++# BCQ Tools ++ ++This directory includes some tools related with BCQ. ++ ++## preserve_bcq_info ++ ++### Purpose ++ ++`preserve_bcq_info` is for preserving constant nodes which include BCQ information. ++When `.pb` file is converted to `.tflite` file by TFlite converter, constant nodes whose values are exactly same are removed and then linked to only one representative node. ++This makes us impossible to know what constant node should be linked to a node which we want to apply BCQ. ++One of the solutions is making all the same constant nodes different by inserting unique values and ignore the newly generated unique values when BCQ fusing is applied. ++`preserve_bcq_info` will generate and insert unique dummy values to the constant nodes whose values are same not to be removed by Tensorflow Lite converter. ++As a result, BCQ information will be preserved. ++ ++### How to use ++ ++```bash ++preserve_bcq_info \ ++--input_path /path/to/original_model.pb \ ++--output_path /path/to/preserved_model.pb ++``` ++ ++### How it works ++ ++If we add unique dummy value at the end of each constant nodes, all the constant nodes would be different. Following is an example. ++ ++``` ++[Original Constant Nodes] ++const(value=[1, 2, 3], name='const1') ++const(value=[1, 2, 3], name='const2') ++const(value=[1, 2, 3], name='const3') ++ ++[After BCQ information preserved] ++const(value=[1, 2, 3, -1], name='const1') ++const(value=[1, 2, 3, -2], name='const2') ++const(value=[1, 2, 3, -3], name='const3') ++``` ++ ++For dummy values, negative values are used instead of positive values. ++This is because positive valus may be confused with original constant node values. ++For your information, unique dummy value starts from -1 and moves to -2, -3, ..., -N, where N is the number of preserved constant nodes. ++ ++### Caution ++ ++- Newly generated dummy values should be ignored when the constant nodes are used. ++ ++## generate_bcq_output_arrays ++ ++### Purpose ++ ++To apply BCQ, BCQ information nodes should be designated as model output so that they are alive even after TFLite conversion is finished. 
++However, there are so many nodes to designate and sometimes we cannot copy and paste all of them because the string size is too big. ++`generate_bcq_output_arrays` is for generating output_arrays, which include BCQ information nodes. ++ ++### How to use ++ ++```bash ++generate_bcq_output_arrays \ ++--input_path /path/to/original_model.pb \ ++--output_path /path/to/output_arrays.txt ++``` ++ ++### How it works ++ ++``` ++[Original BCQ information nodes] ++const(value=[1, 2, 3, -1], name='const1') ++const(value=[1, 2, 3, -2], name='const2') ++const(value=[1, 2, 3, -3], name='const3') ++ ++[Generated output_arrays] ++,const1,const2,const3 ++``` ++ ++### Caution ++ ++- Generated output_arrays will be start with comma. +diff --git a/compiler/bcq-tools/generate_bcq_output_arrays b/compiler/bcq-tools/generate_bcq_output_arrays +new file mode 100644 +index 0000000..48e8a93 +--- /dev/null ++++ b/compiler/bcq-tools/generate_bcq_output_arrays +@@ -0,0 +1,90 @@ ++#!/usr/bin/env python3 ++ ++import tensorflow as tf ++ ++import argparse ++import sys ++ ++ ++def _get_parser(): ++ """ ++ Returns an ArgumentParser for generating output_arrays. ++ """ ++ parser = argparse.ArgumentParser( ++ description=("Command line tool to generated output_arrays of BCQ nodes")) ++ ++ # Input and output path. ++ parser.add_argument( ++ "-i", ++ "--input_path", ++ type=str, ++ help="Full filepath of the input file.", ++ required=True) ++ parser.add_argument( ++ "-o", ++ "--output_path", ++ type=str, ++ help="Full filepath of the output file.", ++ required=True) ++ ++ return parser ++ ++ ++def load_graph(frozen_graph_filename): ++ """ ++ Load graph from frozen pb file ++ """ ++ with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f: ++ graph_def = tf.compat.v1.GraphDef() ++ graph_def.ParseFromString(f.read()) ++ with tf.Graph().as_default() as graph: ++ tf.import_graph_def(graph_def, name='') ++ return graph ++ ++ ++def dtype2str(dtype): ++ if dtype == "int32": ++ return "TF_INT32" ++ elif dtype == "int64": ++ return "TF_INT64" ++ elif dtype == "float32": ++ return "TF_FLOAT" ++ elif dtype == "bool": ++ return "TF_BOOL" ++ else: ++ raise Exception("Not supported dtype") ++ ++ ++def print_output_arrays(flags): ++ graph_model = load_graph(flags.input_path) ++ graph_model_def = graph_model.as_graph_def() ++ ops = graph_model.get_operations() ++ ++ output_names = [op.outputs[0].name for op in ops ++ if op.type == "Const" and "bcqinfo_" in op.outputs[0].name] ++ ++ output_arrays = "" ++ for output_name in output_names: ++ output_arrays += "," ++ ++ colon_index = output_name.find(":") ++ if colon_index == -1: ++ output_arrays += output_name ++ else: ++ output_arrays += output_name[:colon_index] ++ ++ f = open(flags.output_path, 'w') ++ f.write(output_arrays) ++ f.close() ++ ++ ++def main(): ++ # Parse argument. ++ parser = _get_parser() ++ flags = parser.parse_known_args(args=sys.argv[1:]) ++ ++ print_output_arrays(flags[0]) ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/compiler/bcq-tools/preserve_bcq_info b/compiler/bcq-tools/preserve_bcq_info +new file mode 100644 +index 0000000..2ede8d4 +--- /dev/null ++++ b/compiler/bcq-tools/preserve_bcq_info +@@ -0,0 +1,116 @@ ++#!/usr/bin/env python3 ++ ++import tensorflow as tf ++import numpy as np ++ ++import argparse ++import sys ++ ++ ++def _get_parser(): ++ """ ++ Returns an ArgumentParser for preserving BCQ information. ++ """ ++ parser = argparse.ArgumentParser( ++ description=("Command line tool to preserve BCQ information")) ++ ++ # Input and output path. 
++ parser.add_argument( ++ "-i", ++ "--input_path", ++ type=str, ++ help="Full filepath of the input file.", ++ required=True) ++ parser.add_argument( ++ "-o", ++ "--output_path", ++ type=str, ++ help="Full filepath of the output file.", ++ required=True) ++ ++ return parser ++ ++ ++def load_graph(frozen_graph_filename): ++ """ ++ Load graph from frozen pb file ++ """ ++ with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f: ++ graph_def = tf.compat.v1.GraphDef() ++ graph_def.ParseFromString(f.read()) ++ with tf.Graph().as_default() as graph: ++ tf.import_graph_def(graph_def, name='') ++ return graph ++ ++ ++def preserve_bcq_info(flags): ++ """ ++ Generate unique dummy value from -1 to -N. ++ ++ We use negative values to preserve BCQ information because ++ positive values may cause some confusion with real BCQ information values. ++ """ ++ ++ class UniqueValueGen: ++ def __init__(self): ++ self.unique_value = -1 ++ ++ def gen(self): ++ val = self.unique_value ++ self.unique_value = val - 1 ++ return val ++ ++ unique_value = UniqueValueGen() ++ ++ original_graph_model = load_graph(flags.input_path) ++ original_graph_model_def = original_graph_model.as_graph_def() ++ ++ new_graph = tf.compat.v1.GraphDef() ++ substitution_dict = {} ++ ++ DT_INT32 = None # Just for copying DT_INT32 attribute value ++ ++ for node in original_graph_model_def.node: ++ if node.op == "Const": ++ # Because bcqinfo_do_w_x is BOOL type, we cannot add dummy value at the end. ++ # Therefore we should convert the type to INT32 type. ++ if "/bcqinfo_do_w_x" in node.name: ++ original_tensor = tf.make_ndarray(node.attr["value"].tensor) ++ substitution_dict[node.name] = tf.make_tensor_proto( ++ [int(original_tensor[0]), unique_value.gen()], tf.int32) ++ ++ preserved_bcqinfo_list = ["/bcqinfo_number_of_clusters", "/bcqinfo_size_of_clusters", ++ "/bcqinfo_qbits_of_clusters"] ++ ++ if any(name in node.name for name in preserved_bcqinfo_list): ++ original_tensor = tf.make_ndarray( ++ node.attr["value"].tensor) # variable name change ++ substitution_dict[node.name] = tf.make_tensor_proto( ++ np.append(original_tensor, unique_value.gen()), tf.int32) ++ DT_INT32 = node.attr["dtype"] ++ ++ for node in original_graph_model_def.node: ++ if node.name in substitution_dict: ++ new_node = new_graph.node.add() ++ new_node.op = "Const" ++ new_node.name = node.name ++ new_node.attr["dtype"].CopyFrom(DT_INT32) ++ new_node.attr["value"].tensor.CopyFrom(substitution_dict[node.name]) ++ else: ++ new_node = new_graph.node.add() ++ new_node.CopyFrom(node) ++ ++ tf.io.write_graph(new_graph, '.', flags.output_path, False) ++ ++ ++def main(): ++ # Parse argument. ++ parser = _get_parser() ++ flags = parser.parse_known_args(args=sys.argv[1:]) ++ ++ # Generate a new pb file, which BCQ information is preserved. 
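The substitution above relies on a simple observation: appending a unique negative sentinel makes otherwise identical constant payloads distinct, so the converter can no longer fold them into a single node. The same idea in standalone form (illustrative only; the real tool edits GraphDef tensor protos):

```cpp
#include <cstdio>
#include <set>
#include <vector>

int main()
{
  // Three constants with identical payloads, as in the README example above.
  std::vector<std::vector<int>> constants{{1, 2, 3}, {1, 2, 3}, {1, 2, 3}};

  int next_sentinel = -1;
  for (auto &c : constants)
    c.push_back(next_sentinel--); // {1,2,3,-1}, {1,2,3,-2}, {1,2,3,-3}

  std::set<std::vector<int>> unique(constants.begin(), constants.end());
  std::printf("%zu distinct constants\n", unique.size()); // prints 3
}
```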
++ preserve_bcq_info(flags[0]) ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt +index 1335057..009bfab 100644 +--- a/compiler/circle-quantizer/CMakeLists.txt ++++ b/compiler/circle-quantizer/CMakeLists.txt +@@ -13,5 +13,6 @@ target_link_libraries(circle-quantizer luci_service) + target_link_libraries(circle-quantizer luci_pass) + target_link_libraries(circle-quantizer luci_export) + target_link_libraries(circle-quantizer arser) ++target_link_libraries(circle-quantizer vconone) + + install(TARGETS circle-quantizer DESTINATION bin) +diff --git a/compiler/circle-quantizer/requires.cmake b/compiler/circle-quantizer/requires.cmake +index 2293e53..c21e28e 100644 +--- a/compiler/circle-quantizer/requires.cmake ++++ b/compiler/circle-quantizer/requires.cmake +@@ -5,3 +5,4 @@ require("safemain") + require("luci") + require("oops") + require("arser") ++require("vconone") +diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp +index b56b547..8d3a80c 100644 +--- a/compiler/circle-quantizer/src/CircleQuantizer.cpp ++++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp +@@ -25,6 +25,7 @@ + + #include + #include ++#include + + #include + #include +@@ -36,6 +37,12 @@ using OptionHook = std::function; + using Algorithms = luci::CircleOptimizer::Options::Algorithm; + using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters; + ++void print_version(void) ++{ ++ std::cout << "circle-quantizer version " << vconone::get_string() << std::endl; ++ std::cout << vconone::get_copyright() << std::endl; ++} ++ + int entry(int argc, char **argv) + { + // Simple argument parser (based on map) +@@ -49,13 +56,20 @@ int entry(int argc, char **argv) + + arser::Arser arser("circle-quantizer provides circle model quantization"); + ++ arser.add_argument("--version") ++ .nargs(0) ++ .required(false) ++ .default_value(false) ++ .help("Show version information and exit") ++ .exit_with(print_version); ++ + arser.add_argument(qdqw) + .nargs(3) + .type(arser::DataType::STR_VEC) + .required(false) + .help("Quantize-dequantize weight values required action before quantization. " + "Three arguments required: input_dtype(float32) " +- "output_dtype(uint8) granularity(layer)"); ++ "output_dtype(uint8) granularity(layer, channel)"); + + arser.add_argument(qwmm) + .nargs(3) +@@ -63,7 +77,7 @@ int entry(int argc, char **argv) + .required(false) + .help("Quantize with min/max values. 
" + "Three arguments required: input_dtype(float32) " +- "output_dtype(uint8) granularity(layer)"); ++ "output_dtype(uint8) granularity(layer, channel)"); + + arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model"); + arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model"); +diff --git a/compiler/circle-tensordump/driver/Driver.cpp b/compiler/circle-tensordump/driver/Driver.cpp +index a55cd45..38e3073 100644 +--- a/compiler/circle-tensordump/driver/Driver.cpp ++++ b/compiler/circle-tensordump/driver/Driver.cpp +@@ -46,7 +46,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + std::unique_ptr dump; +diff --git a/compiler/circle-tensordump/src/Dump.cpp b/compiler/circle-tensordump/src/Dump.cpp +index dfa78f0..a8d3256 100644 +--- a/compiler/circle-tensordump/src/Dump.cpp ++++ b/compiler/circle-tensordump/src/Dump.cpp +@@ -136,6 +136,7 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s + auto max = quant_param->max(); + auto scale = quant_param->scale(); + auto zero_point = quant_param->zero_point(); ++ auto quantized_dimension = quant_param->quantized_dimension(); + + os << " " + print_format2 + "   ├── min : "; + ::print_comma_sepearted(os, min); +@@ -146,9 +147,11 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s + os << " " + print_format2 + "   ├── scale : "; + ::print_comma_sepearted(os, scale); + os << std::endl; +- os << " " + print_format2 + "   └── zero_point : "; ++ os << " " + print_format2 + "   ├── zero_point : "; + ::print_comma_sepearted(os, zero_point); + os << std::endl; ++ os << " " + print_format2 + "   └── quantized_dimension : " << quantized_dimension; ++ os << std::endl; + } + + // buffer +@@ -229,7 +232,7 @@ std::vector hdf5_dims_cast(const flatbuffers::Vector *data, + } + + /** +- * This function writes data to given hdf5 file like below. ++ * This function writes vector data to given hdf5 file like below. 
+ * + * GROUP "group_name" + * ㄴDATATYPE "type" +@@ -238,9 +241,9 @@ std::vector hdf5_dims_cast(const flatbuffers::Vector *data, + * ㄴDATA "data" + */ + template +-void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, +- const H5::PredType &type, const flatbuffers::Vector *data, +- std::vector dims) ++void write_vector_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, ++ const H5::PredType &type, const flatbuffers::Vector *data, ++ std::vector dims) + { + if (data == nullptr) + return; +@@ -250,6 +253,17 @@ void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string d + dataset->write(data->data(), type); + } + ++/// @brief This function writes scalar data to given hdf5 file ++template ++void write_scalar_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name, ++ const H5::PredType &type, T data) ++{ ++ auto dataspace = std::make_unique(H5S_SCALAR); ++ auto dataset = std::make_unique( ++ file.createDataSet(group_name + "/" + dataset_name, type, *dataspace)); ++ dataset->write(&data, type); ++} ++ + } // namespace + + namespace circletensordump +@@ -297,8 +311,9 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model, + auto buff_data_ptr = reader.buffers()->Get(buff_idx)->data(); + if (buff_data_ptr) + { +- ::write_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()), +- buff_data_ptr, ::hdf5_dims_cast(buff_data_ptr, tensor->shape())); ++ ::write_vector_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()), ++ buff_data_ptr, ++ ::hdf5_dims_cast(buff_data_ptr, tensor->shape())); + } + + // write quantization parameters +@@ -306,17 +321,20 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model, + if (quant_param) + { + auto min = quant_param->min(); +- ::write_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min, +- ::hdf5_dims_cast(min)); ++ ::write_vector_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min, ++ ::hdf5_dims_cast(min)); + auto max = quant_param->max(); +- ::write_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max, +- ::hdf5_dims_cast(max)); ++ ::write_vector_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max, ++ ::hdf5_dims_cast(max)); + auto scale = quant_param->scale(); +- ::write_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale, +- ::hdf5_dims_cast(scale)); ++ ::write_vector_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale, ++ ::hdf5_dims_cast(scale)); + auto zero_point = quant_param->zero_point(); +- ::write_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, zero_point, +- ::hdf5_dims_cast(zero_point)); ++ ::write_vector_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, ++ zero_point, ::hdf5_dims_cast(zero_point)); ++ auto quantized_dimension = quant_param->quantized_dimension(); ++ ::write_scalar_data_to_hdf5(file, group_name, "quantized_dimension", ++ H5::PredType::NATIVE_INT32, quantized_dimension); + } + } + } +diff --git a/compiler/circle-verify/src/Driver.cpp b/compiler/circle-verify/src/Driver.cpp +index 1af31d9..7a44c65 100644 +--- a/compiler/circle-verify/src/Driver.cpp ++++ b/compiler/circle-verify/src/Driver.cpp +@@ -35,7 +35,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + auto verifier = std::make_unique(); 
+diff --git a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt +index 6663cb9..4bcaae3 100644 +--- a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt ++++ b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt +@@ -1,25 +1,12 @@ + nnas_include(TargetRequire) + + unset(REQUIRED_TARGETS) +-list(APPEND REQUIRED_TARGETS circlechef) + list(APPEND REQUIRED_TARGETS circle-inspect) + list(APPEND REQUIRED_TARGETS circle-verify) + list(APPEND REQUIRED_TARGETS circle2circle) + list(APPEND REQUIRED_TARGETS dredd_rule_lib) +-list(APPEND REQUIRED_TARGETS tflchef) +-list(APPEND REQUIRED_TARGETS tflite2circle) + TargetRequire_Return(${REQUIRED_TARGETS}) + +-nncc_find_resource(TensorFlowLiteRecipes) +-nncc_find_resource(CircleRecipes) +- +-set(TFLITE_RECIPE_REPO "${TensorFlowLiteRecipes_DIR}") +-set(CIRCLE_RECIPE_REPO "${CircleRecipes_DIR}") +-unset(RECIPE_REPO) +- +-set(TEST_RECIPE_FILENAME "test.recipe") +-set(TEST_RULE_FILENAME "test.rule") +- + unset(TEST_DEPS) + unset(TEST_NAMES) + +@@ -27,21 +14,9 @@ set(options "") + set(oneValueArgs "") + set(multiValueArgs PASS) + +-macro(Add RECIPE) +- if(NOT EXISTS "${TFLITE_RECIPE_REPO}/${RECIPE}/test.recipe") +- if(NOT EXISTS "${CIRCLE_RECIPE_REPO}/${RECIPE}/test.recipe") +- message(FATAL_ERROR "Missing recipe of '${RECIPE}' test") +- else() +- set(RECIPE_REPO ${CIRCLE_RECIPE_REPO}) +- endif() +- else() +- set(RECIPE_REPO ${TFLITE_RECIPE_REPO}) +- endif() +- +- if(NOT EXISTS "${RECIPE_REPO}/${RECIPE}/test.rule") +- message(FATAL_ERROR "Missing rule of '${RECIPE}' test") +- endif() ++get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR) + ++macro(Add RECIPE) + cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + unset(OPT_OPTIONS) + foreach(src ${ARG_PASS}) +@@ -49,71 +24,20 @@ macro(Add RECIPE) + list(APPEND OPT_OPTIONS "--${src}") + endforeach(src ${ARG_PASS}) + +- set(RECIPE_FILE "${RECIPE}.recipe") +- set(RECIPE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RECIPE_FILENAME}") +- set(RECIPE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RECIPE_FILE}") +- +- set(RULE_FILE "${RECIPE}.rule") +- set(RULE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RULE_FILENAME}") +- set(RULE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RULE_FILE}") +- +- set(TFLITE_FILE "${RECIPE}.tflite") +- set(TFLITE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${TFLITE_FILE}") +- + set(CIRCLE_FILE "${RECIPE}.circle") +- set(CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${CIRCLE_FILE}") ++ set(CIRCLE_PATH "${ARTIFACTS_BIN_PATH}/${CIRCLE_FILE}") + + set(OPT_CIRCLE_FILE "${RECIPE}.opt.circle") + set(OPT_CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${OPT_CIRCLE_FILE}") + +- # Copy .recipe +- add_custom_command(OUTPUT ${RECIPE_BINARY_PATH} +- COMMAND ${CMAKE_COMMAND} -E copy "${RECIPE_SOURCE_PATH}" "${RECIPE_BINARY_PATH}" +- DEPENDS ${RECIPE_SOURCE_PATH} +- COMMENT "Generate ${RECIPE_FILE}" +- ) +- +- # Copy .rule +- add_custom_command(OUTPUT ${RULE_BINARY_PATH} +- COMMAND ${CMAKE_COMMAND} -E copy "${RULE_SOURCE_PATH}" "${RULE_BINARY_PATH}" +- DEPENDS ${RULE_SOURCE_PATH} +- COMMENT "Generate ${RULE_FILE}" +- ) +- +- if(${RECIPE_REPO} STREQUAL ${TFLITE_RECIPE_REPO}) +- # Generate .tflite +- add_custom_command(OUTPUT ${TFLITE_OUTPUT_PATH} +- COMMAND $ ${RECIPE_BINARY_PATH} ${TFLITE_OUTPUT_PATH} +- DEPENDS $ ${RECIPE_BINARY_PATH} +- COMMENT "Generate ${TFLITE_FILE}" +- ) +- +- # Generate .circle +- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH} +- COMMAND $ ${TFLITE_OUTPUT_PATH} 
${CIRCLE_OUTPUT_PATH} +- DEPENDS $ ${TFLITE_OUTPUT_PATH} +- COMMENT "Generate ${CIRCLE_FILE}" +- ) +- +- list(APPEND TEST_DEPS ${TFLITE_OUTPUT_PATH}) +- else() +- # Generate .circle +- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH} +- COMMAND $ ${RECIPE_BINARY_PATH} ${CIRCLE_OUTPUT_PATH} +- DEPENDS $ ${RECIPE_BINARY_PATH} +- COMMENT "Generate ${CIRCLE_FILE}" +- ) +- endif() +- + # Generate optimized .circle + add_custom_command(OUTPUT ${OPT_CIRCLE_OUTPUT_PATH} +- COMMAND $ ${OPT_OPTIONS} ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH} +- DEPENDS $ ${CIRCLE_OUTPUT_PATH} ++ COMMAND $ ${OPT_OPTIONS} ${CIRCLE_PATH} ${OPT_CIRCLE_OUTPUT_PATH} ++ DEPENDS $ ${CIRCLE_PATH} + COMMENT "Generate ${OPT_CIRCLE_FILE}" + ) + +- list(APPEND TEST_DEPS ${RECIPE_BINARY_PATH} ${RULE_BINARY_PATH} +- ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}) ++ list(APPEND TEST_DEPS ${OPT_CIRCLE_OUTPUT_PATH}) + list(APPEND TEST_NAMES ${RECIPE}) + endmacro(Add) + +@@ -174,12 +98,15 @@ list(APPEND TEST_DEPS "${RULE_LIB_BINARY_PATH}") + + # Generate dependencies + add_custom_target(circle2circle_dredd_recipe_test ALL DEPENDS ${TEST_DEPS}) ++add_dependencies(circle2circle_dredd_recipe_test common_artifacts_deps) ++ ++get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR) + + # Run tests + add_test( + NAME circle2circle_dredd_recipe_test + COMMAND "${TEST_RUNNER}" + "${TEST_CONFIG}" +- "${CMAKE_CURRENT_BINARY_DIR}" ++ "${ARTIFACTS_BIN_PATH}" + ${TEST_NAMES} + ) +diff --git a/compiler/circle2circle-dredd-recipe-test/requires.cmake b/compiler/circle2circle-dredd-recipe-test/requires.cmake +index e4a5b71..70e7c52 100644 +--- a/compiler/circle2circle-dredd-recipe-test/requires.cmake ++++ b/compiler/circle2circle-dredd-recipe-test/requires.cmake +@@ -1,7 +1,5 @@ +-require("circlechef") + require("circle2circle") + require("circle-inspect") + require("circle-verify") ++require("common-artifacts") + require("dredd-rule-lib") +-require("tflchef") +-require("tflite2circle") +diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst +index 202f669..6328a64 100644 +--- a/compiler/circle2circle-dredd-recipe-test/test.lst ++++ b/compiler/circle2circle-dredd-recipe-test/test.lst +@@ -11,9 +11,10 @@ + ## TFLITE RECIPE + + Add(Net_InstanceNorm_001 PASS fuse_instnorm) +-# Add(Net_InstanceNorm_002 PASS fuse_instnorm) ++Add(Net_InstanceNorm_002 PASS fuse_instnorm) + Add(BatchMatMulV2_000 PASS resolve_customop_batchmatmul) + Add(MatMul_000 PASS resolve_customop_matmul) ++Add(DepthwiseConv2D_003 PASS) + + ## CIRCLE RECIPE + +diff --git a/compiler/circle2circle-dredd-recipe-test/testall.sh b/compiler/circle2circle-dredd-recipe-test/testall.sh +index 33a2036..2899587 100755 +--- a/compiler/circle2circle-dredd-recipe-test/testall.sh ++++ b/compiler/circle2circle-dredd-recipe-test/testall.sh +@@ -13,21 +13,22 @@ if [[ $# -lt 2 ]]; then + exit 255 + fi + ++WORKDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + CONFIG_PATH="$1"; shift +-WORKDIR="$1"; shift ++RESOURCE_DIR="$1"; shift + + source "${CONFIG_PATH}" + + echo "-- Found circle-inspect: ${CIRCLE_INSPECT_PATH}" + echo "-- Found circle-verify: ${CIRCLE_VERIFY_PATH}" + echo "-- Found circle2circle: ${CIRCLE2CIRCLE_PATH}" +-echo "-- Found workdir: ${WORKDIR}" ++echo "-- Found common-artifacts: ${RESOURCE_DIR}" + + TESTED=() + PASSED=() + FAILED=() + +-pushd "${WORKDIR}" ++pushd ${WORKDIR} + while [[ $# -ne 0 ]]; do + PREFIX="$1"; shift + +@@ -40,7 +41,7 @@ while [[ $# -ne 0 ]]; do + cat > "${PREFIX}.log" <( + exec 2>&1 + 
+- echo "-- Found tflite: ${PREFIX}.tflite" ++ echo "-- Found circle: ${PREFIX}.opt.circle" + + # Exit immediately if any command fails + set -e +@@ -55,7 +56,7 @@ while [[ $# -ne 0 ]]; do + set +x + + # (COMPILED_FILE, INSPECT_PROG_PATH, VERIFY_PROG_PATH, ERROR_LOG) must be set for rule-lib.sh +- COMPILED_FILE="${WORKDIR}/${PREFIX}.opt.circle" ++ COMPILED_FILE="${PREFIX}.opt.circle" + INSPECT_PROG_PATH=${CIRCLE_INSPECT_PATH} + VERIFY_PROG_PATH=${CIRCLE_VERIFY_PATH} + ERROR_LOG="${PREFIX}.error" +@@ -66,7 +67,7 @@ while [[ $# -ne 0 ]]; do + trap 'echo "** ERROR **" ; cat "${ERROR_LOG}"' ERR + + source rule-lib.sh +- source "${PREFIX}.rule" ++ source "${RESOURCE_DIR}/${PREFIX}.rule" + + # unset + trap - ERR +diff --git a/compiler/circle2circle/CMakeLists.txt b/compiler/circle2circle/CMakeLists.txt +index 7b2bf9b..f60c896 100644 +--- a/compiler/circle2circle/CMakeLists.txt ++++ b/compiler/circle2circle/CMakeLists.txt +@@ -19,6 +19,7 @@ target_link_libraries(circle2circle luci_service) + target_link_libraries(circle2circle luci_pass) + target_link_libraries(circle2circle luci_export) + target_link_libraries(circle2circle arser) ++target_link_libraries(circle2circle vconone) + + install(TARGETS circle2circle DESTINATION bin) + +@@ -44,3 +45,4 @@ target_link_libraries(circle2circle_test luci_service) + target_link_libraries(circle2circle_test luci_pass) + target_link_libraries(circle2circle_test luci_export) + target_link_libraries(circle2circle_test arser) ++target_link_libraries(circle2circle_test vconone) +diff --git a/compiler/circle2circle/requires.cmake b/compiler/circle2circle/requires.cmake +index 8cbb90d..36a9efd 100644 +--- a/compiler/circle2circle/requires.cmake ++++ b/compiler/circle2circle/requires.cmake +@@ -9,3 +9,4 @@ require("hermes") + require("hermes-std") + require("luci") + require("arser") ++require("vconone") +diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp +index 6888d26..849597b 100644 +--- a/compiler/circle2circle/src/Circle2Circle.cpp ++++ b/compiler/circle2circle/src/Circle2Circle.cpp +@@ -26,6 +26,7 @@ + + #include + #include ++#include + + #include + #include +@@ -34,6 +35,12 @@ + using Algorithms = luci::CircleOptimizer::Options::Algorithm; + using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters; + ++void print_version(void) ++{ ++ std::cout << "circle2circle version " << vconone::get_string() << std::endl; ++ std::cout << vconone::get_copyright() << std::endl; ++} ++ + int entry(int argc, char **argv) + { + // Simple argument parser (based on map) +@@ -44,6 +51,13 @@ int entry(int argc, char **argv) + + arser::Arser arser("circle2circle provides circle model optimization and transformations"); + ++ arser.add_argument("--version") ++ .nargs(0) ++ .required(false) ++ .default_value(false) ++ .help("Show version information and exit") ++ .exit_with(print_version); ++ + arser.add_argument("--all").nargs(0).required(false).default_value(false).help( + "Enable all optimize options"); + +diff --git a/compiler/circlechef/CMakeLists.txt b/compiler/circlechef/CMakeLists.txt +index cba7d0a..3e2ddcb 100644 +--- a/compiler/circlechef/CMakeLists.txt ++++ b/compiler/circlechef/CMakeLists.txt +@@ -18,4 +18,6 @@ add_subdirectory(core) + add_subdirectory(circle) + # Tools + add_subdirectory(tools) +-add_subdirectory(tests) ++if(ENABLE_TEST) ++ add_subdirectory(tests) ++endif(ENABLE_TEST) +diff --git a/compiler/circlechef/circle/src/RecipeChef.cpp b/compiler/circlechef/circle/src/RecipeChef.cpp 
+index 17ef1be..51326c7 100644 +--- a/compiler/circlechef/circle/src/RecipeChef.cpp ++++ b/compiler/circlechef/circle/src/RecipeChef.cpp +@@ -181,6 +181,8 @@ std::unique_ptr generate_recipe(const circle::Model *model) + for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx) + chef_quant->add_zero_point(quant->zero_point()->Get(idx)); + } ++ circlechef::TensorQuantization *chef_quant = operand->mutable_quant(); ++ chef_quant->set_quantized_dimension(quant->quantized_dimension()); + } + } + +diff --git a/compiler/circlechef/core/src/ModelChef.cpp b/compiler/circlechef/core/src/ModelChef.cpp +index 76aeacd..d81467d 100644 +--- a/compiler/circlechef/core/src/ModelChef.cpp ++++ b/compiler/circlechef/core/src/ModelChef.cpp +@@ -413,6 +413,7 @@ template void cook_graph(const T &graph, CookParams &cp) + quant_builder.add_min(quant_min); + quant_builder.add_scale(quant_scale); + quant_builder.add_zero_point(quant_zero_point); ++ quant_builder.add_quantized_dimension(quant.quantized_dimension()); + + // Update QuantizationParameters Index + quant_index = quant_builder.Finish(); +diff --git a/compiler/circlechef/proto/circlechef.proto b/compiler/circlechef/proto/circlechef.proto +index b8c009b..3e5e6b1 100644 +--- a/compiler/circlechef/proto/circlechef.proto ++++ b/compiler/circlechef/proto/circlechef.proto +@@ -35,6 +35,7 @@ message TensorQuantization { + repeated float max = 2; + repeated float scale = 3; + repeated int64 zero_point = 4; ++ optional int32 quantized_dimension = 5 [default = 0]; + } + + message Operand { +diff --git a/compiler/circlechef/tools/file/Driver.cpp b/compiler/circlechef/tools/file/Driver.cpp +index a15da40..bcc0c7a 100644 +--- a/compiler/circlechef/tools/file/Driver.cpp ++++ b/compiler/circlechef/tools/file/Driver.cpp +@@ -41,7 +41,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + int32_t model_version = 1; +diff --git a/compiler/circlechef/tools/reverse/Driver.cpp b/compiler/circlechef/tools/reverse/Driver.cpp +index 9c0b9ea..8a2b85f 100644 +--- a/compiler/circlechef/tools/reverse/Driver.cpp ++++ b/compiler/circlechef/tools/reverse/Driver.cpp +@@ -38,7 +38,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + std::string circle_path = arser.get("circle"); +diff --git a/compiler/circledump/driver/Driver.cpp b/compiler/circledump/driver/Driver.cpp +index b8f561f..657f24f 100644 +--- a/compiler/circledump/driver/Driver.cpp ++++ b/compiler/circledump/driver/Driver.cpp +@@ -33,7 +33,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << '\n'; + std::cout << arser; +- return 0; ++ return 255; + } + + std::string circle_path = arser.get("circle"); +diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp +index 2c03203..5aa5d51 100644 +--- a/compiler/circledump/src/OpPrinter.cpp ++++ b/compiler/circledump/src/OpPrinter.cpp +@@ -593,6 +593,20 @@ public: + } + }; + ++class UniquePrinter : public OpPrinter ++{ ++public: ++ void options(const circle::Operator *op, std::ostream &os) const override ++ { ++ if (auto *params = op->builtin_options_as_UniqueOptions()) ++ { ++ os << " "; ++ os << "idx_out_type(" << EnumNameTensorType(params->idx_out_type()) << ") "; ++ os << std::endl; ++ } ++ } ++}; ++ + class WhilePrinter : public OpPrinter + { + public: +@@ -744,6 +758,7 @@ OpPrinterRegistry::OpPrinterRegistry() + _op_map[circle::BuiltinOperator_SUM] = 
make_unique(); + _op_map[circle::BuiltinOperator_TRANSPOSE_CONV] = make_unique(); + // There is no Option for TOPK_V2 ++ _op_map[circle::BuiltinOperator_UNIQUE] = make_unique(); + _op_map[circle::BuiltinOperator_WHILE] = make_unique(); + _op_map[circle::BuiltinOperator_CUSTOM] = make_unique(); + +diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst +index b614b71..d3f5601 100644 +--- a/compiler/common-artifacts/exclude.lst ++++ b/compiler/common-artifacts/exclude.lst +@@ -5,9 +5,12 @@ + + #[[ optimize : Exclude from circle optimization(circle2circle) ]] + ## TensorFlowLiteRecipes +-optimize(ReLU6_000) +-optimize(Where_000) +-optimize(Where_001) ++optimize(Unique_000) ++optimize(Unique_001) ++optimize(Unique_002) ++optimize(Unique_003) ++optimize(Unique_U8_000) ++optimize(Unique_U8_001) + + ## CircleRecipes + +@@ -46,6 +49,7 @@ tcgenerate(DepthToSpace_000) + tcgenerate(DepthwiseConv2D_001) # runtime doesn't support dilation + tcgenerate(DepthwiseConv2D_003) # runtime doesn't support dilation + tcgenerate(DepthwiseConv2D_U8_000) ++tcgenerate(DepthwiseConv2D_U8_001) # luci-interpreter doesn't support channel-wise quantization yet + tcgenerate(Div_000) + tcgenerate(ELU_000) + tcgenerate(Equal_000) +@@ -96,7 +100,7 @@ tcgenerate(Neg_000) + tcgenerate(Net_Dangle_001) + tcgenerate(Net_InstanceNorm_001) + tcgenerate(Net_InstanceNorm_002) +-tcgenerate(Net_ZeroDim_001) # fix luci ++tcgenerate(Net_ZeroDim_001) # luci-interpreter doesn't support zero dim + tcgenerate(NotEqual_000) + tcgenerate(OneHot_000) + tcgenerate(OneHot_001) +@@ -120,9 +124,9 @@ tcgenerate(ReduceProd_001) + tcgenerate(ReduceProd_002) + tcgenerate(ReduceProd_003) + tcgenerate(ReLU_000) +-tcgenerate(ReLU6_000) # luci NYI ++tcgenerate(ReLU6_000) + tcgenerate(ReLUN1To1_000) +-tcgenerate(Reshape_003) # fix luci ++tcgenerate(Reshape_003) # luci-interpreter doesn't support reshape without built-in option + tcgenerate(Reshape_U8_000) + tcgenerate(ResizeBilinear_000) + tcgenerate(ResizeNearestNeighbor_000) +@@ -148,7 +152,7 @@ tcgenerate(SpaceToBatchND_002) + tcgenerate(SpaceToBatchND_003) + tcgenerate(SpaceToDepth_000) + tcgenerate(SparseToDense_000) +-tcgenerate(SplitV_000) # fix luci ++tcgenerate(SplitV_000) + tcgenerate(Sqrt_000) + tcgenerate(Square_000) + tcgenerate(SquaredDifference_000) +@@ -164,22 +168,21 @@ tcgenerate(Sum_001) + tcgenerate(Tanh_000) + tcgenerate(Tile_000) + tcgenerate(Tile_U8_000) +-tcgenerate(TopKV2_000) # fix luci +-tcgenerate(TopKV2_001) # fix luci +-tcgenerate(TransposeConv_000) # fix interpreter ++tcgenerate(TopKV2_000) ++tcgenerate(TopKV2_001) + tcgenerate(Unique_000) + tcgenerate(Unique_001) + tcgenerate(Unique_002) + tcgenerate(Unique_003) + tcgenerate(Unique_U8_000) + tcgenerate(Unique_U8_001) +-tcgenerate(Where_000) # luci NYI +-tcgenerate(Where_001) # luci NYI +-tcgenerate(While_000) # fix luci ++tcgenerate(Where_000) ++tcgenerate(Where_001) ++tcgenerate(While_000) + tcgenerate(While_001) + tcgenerate(While_002) + tcgenerate(While_003) +-tcgenerate(YUV_TO_RGB_000) # fix luci ++tcgenerate(YUV_TO_RGB_000) + tcgenerate(YUV_TO_RGB_U8_000) + tcgenerate(ZerosLike_000) + +diff --git a/compiler/hermes/src/hermes.test.cpp b/compiler/hermes/src/hermes.test.cpp +index 2cbc093..ea7ef65 100644 +--- a/compiler/hermes/src/hermes.test.cpp ++++ b/compiler/hermes/src/hermes.test.cpp +@@ -18,7 +18,28 @@ + + #include + +-TEST(HermesTest, simple_usecase) ++namespace + { +- // TO BE FILLED ++ ++class Logger final : public hermes::Source ++{ ++public: ++ Logger(hermes::Context *ctx); ++ 
~Logger(); ++}; ++ ++Logger::Logger(hermes::Context *ctx) { activate(ctx->sources(), ctx->bus()); } ++Logger::~Logger() { deactivate(); } ++ ++} // namespace ++ ++TEST(HermesTest, logger_constructor_NEG) ++{ ++ hermes::Context context; ++ // we expect segmentfault from nullptr->sources() ++ ASSERT_DEATH(Logger logger(&context), ""); ++ ++ SUCCEED(); + } ++ ++// TODO add HermesTest simple_usecase +diff --git a/compiler/locomotiv/src/Node/BiasEncode.test.cpp b/compiler/locomotiv/src/Node/BiasEncode.test.cpp +index cdb255c..4680f5c 100644 +--- a/compiler/locomotiv/src/Node/BiasEncode.test.cpp ++++ b/compiler/locomotiv/src/Node/BiasEncode.test.cpp +@@ -90,6 +90,16 @@ template void test() + } + } // namespace + +-TEST(NodeExecution_BiasEncode, s32) { test(); } ++TEST(NodeExecution_BiasEncode, s32) ++{ ++ test(); ++ ++ SUCCEED(); ++} + +-TEST(NodeExecution_BiasEncode, f32) { test(); } ++TEST(NodeExecution_BiasEncode, f32) ++{ ++ test(); ++ ++ SUCCEED(); ++} +diff --git a/compiler/locomotiv/src/Node/MatMul.test.cpp b/compiler/locomotiv/src/Node/MatMul.test.cpp +index f1f3a52..7d942e1 100644 +--- a/compiler/locomotiv/src/Node/MatMul.test.cpp ++++ b/compiler/locomotiv/src/Node/MatMul.test.cpp +@@ -142,6 +142,8 @@ TEST(NodeExecution_MatMul, f32_2x3_3x3) + }; + + run_test(lhs, rhs, out, Shape{2, 3}, Shape{3, 3}, Shape{2, 3}, loco::DataType::FLOAT32); ++ ++ SUCCEED(); + } + + /* from the code below: +@@ -183,6 +185,8 @@ TEST(NodeExecution_MatMul, s32_4x2_2x6) + }; + + run_test(lhs, rhs, out, Shape{4, 2}, Shape{2, 6}, Shape{4, 6}, loco::DataType::S32); ++ ++ SUCCEED(); + } + + // clang-format on +diff --git a/compiler/locop/src/FormattedGraph.test.cpp b/compiler/locop/src/FormattedGraph.test.cpp +index c9808d3..aff9ebe 100644 +--- a/compiler/locop/src/FormattedGraph.test.cpp ++++ b/compiler/locop/src/FormattedGraph.test.cpp +@@ -28,6 +28,8 @@ TEST(LinearV1FormatterTest, simple) + + // TODO Validate the output (when the implementation becomes stable) + std::cout << locop::fmt(g) << std::endl; ++ ++ SUCCEED(); + } + + TEST(LinearV1FormatterTest, user_defined_node_summary_builder) +diff --git a/compiler/locop/src/FormattedTensorShape.test.cpp b/compiler/locop/src/FormattedTensorShape.test.cpp +index 0f0017a..fc85df3 100644 +--- a/compiler/locop/src/FormattedTensorShape.test.cpp ++++ b/compiler/locop/src/FormattedTensorShape.test.cpp +@@ -30,4 +30,6 @@ TEST(FormattedTensorShapeTest, BracketFormat) + tensor_shape->dim(0) = 4; + + std::cout << fmt(tensor_shape.get()) << std::endl; ++ ++ SUCCEED(); + } +diff --git a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h +index 9987898..4ac3d86 100644 +--- a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h ++++ b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h +@@ -79,12 +79,11 @@ private: + // + // Note that due to historical and performance reasons, per-tensor quantization uses unsigned + // integer types, while per-channel uses signed types assuming 'zero_point' == 0. +-// +-// TODO Add 'quantized_dimension' field for per-channel case when IR provides it. 
+ struct AffineQuantization + { + std::vector scale; + std::vector zero_point; ++ int32_t quantized_dimension; + }; + + class Tensor +@@ -108,6 +107,12 @@ public: + return _quantization.zero_point[0]; + } + ++ const std::vector &scales() const { return _quantization.scale; } ++ ++ const std::vector &zero_points() const { return _quantization.zero_point; } ++ ++ int32_t quantized_dimension() const { return _quantization.quantized_dimension; } ++ + template const T *data() const { return reinterpret_cast(_data.get()); } + + template T *data() { return reinterpret_cast(_data.get()); } +diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h +index a32e0d4..65d1197 100644 +--- a/compiler/luci-interpreter/src/core/KernelParams.h ++++ b/compiler/luci-interpreter/src/core/KernelParams.h +@@ -56,6 +56,11 @@ struct Conv2DParams + Activation activation; + }; + ++struct DepthToSpaceParams ++{ ++ int block_size; ++}; ++ + struct DepthwiseConv2DParams + { + Padding padding; +diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt +index fe36231..a1fd1de 100644 +--- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt ++++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt +@@ -12,6 +12,8 @@ set(SOURCES + Concatenation.cpp + Conv2D.h + Conv2D.cpp ++ DepthToSpace.h ++ DepthToSpace.cpp + DepthwiseConv2D.h + DepthwiseConv2D.cpp + Elu.h +@@ -40,6 +42,10 @@ set(SOURCES + Pad.cpp + Reshape.h + Reshape.cpp ++ Reverse.h ++ Reverse.cpp ++ Slice.h ++ Slice.cpp + Softmax.h + Softmax.cpp + SpaceToDepth.h +@@ -77,6 +83,7 @@ set(TEST_SOURCES + AveragePool2D.test.cpp + Concatenation.test.cpp + Conv2D.test.cpp ++ DepthToSpace.test.cpp + DepthwiseConv2D.test.cpp + Elu.test.cpp + FullyConnected.test.cpp +@@ -91,6 +98,8 @@ set(TEST_SOURCES + Mul.test.cpp + Pad.test.cpp + Reshape.test.cpp ++ Reverse.test.cpp ++ Slice.test.cpp + Softmax.test.cpp + SpaceToDepth.test.cpp + Split.test.cpp +diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp +new file mode 100644 +index 0000000..cab63e2 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp +@@ -0,0 +1,90 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "DepthToSpace.h" ++#include "Utils.h" ++#include ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++ ++DepthToSpace::DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams ¶ms) ++ : KernelWithParams({input}, {output}, params) ++{ ++} ++ ++void DepthToSpace::configure() ++{ ++ if (input()->shape().num_dims() != 4) ++ { ++ throw std::runtime_error("Invalid input num_dims."); ++ } ++ if (output()->element_type() != DataType::FLOAT32 && output()->element_type() != DataType::U8 && ++ output()->element_type() != DataType::S8 && output()->element_type() != DataType::S32 && ++ output()->element_type() != DataType::S64) ++ { ++ throw std::runtime_error("Invalid output type"); ++ } ++ if (input()->element_type() != output()->element_type()) ++ { ++ throw std::runtime_error("Type mismatch on input and output."); ++ } ++ const int block_size = params().block_size; ++ const int32_t input_height = input()->shape().dim(1); ++ const int32_t input_width = input()->shape().dim(2); ++ const int32_t input_channels = input()->shape().dim(3); ++ int32_t output_height = input_height * block_size; ++ int32_t output_width = input_width * block_size; ++ int32_t output_channels = input_channels / block_size / block_size; ++ ++ assert(input_height == output_height / block_size); ++ assert(input_width == output_width / block_size); ++ assert(input_channels == output_channels * block_size * block_size); ++ ++ Shape output_shape(4); ++ output_shape.dim(0) = input()->shape().dim(0); ++ output_shape.dim(1) = output_height; ++ output_shape.dim(2) = output_width; ++ output_shape.dim(3) = output_channels; ++ ++ output()->resize(output_shape); ++} ++ ++void DepthToSpace::execute() const ++{ ++ tflite::DepthToSpaceParams op_params; ++ op_params.block_size = params().block_size; ++ switch (input()->element_type()) ++ { ++ case DataType::FLOAT32: ++ tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()), ++ getTensorData(input()), getTensorShape(output()), ++ getTensorData(output())); ++ break; ++ case DataType::U8: ++ tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()), ++ getTensorData(input()), getTensorShape(output()), ++ getTensorData(output())); ++ break; ++ default: ++ throw std::runtime_error("Unsupported Type."); ++ } ++} ++ ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.h b/compiler/luci-interpreter/src/kernels/DepthToSpace.h +new file mode 100644 +index 0000000..63ce376 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.h +@@ -0,0 +1,45 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#ifndef LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H ++#define LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H ++ ++#include "core/Kernel.h" ++#include "core/KernelParams.h" ++ ++#include ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++ ++class DepthToSpace : public KernelWithParams ++{ ++public: ++ DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams ¶ms); ++ ++ const Tensor *input() const { return _inputs[0]; } ++ Tensor *output() const { return _outputs[0]; } ++ ++ void configure() override; ++ void execute() const override; ++}; ++ ++} // namespace kernels ++} // namespace luci_interpreter ++ ++#endif // LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H +diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp +new file mode 100644 +index 0000000..1b80570 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp +@@ -0,0 +1,60 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "kernels/DepthToSpace.h" ++#include "kernels/TestUtils.h" ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++namespace ++{ ++ ++using namespace testing; ++ ++template class DepthToSpaceTest : public ::testing::Test ++{ ++}; ++ ++using DataTypes = ::testing::Types; ++TYPED_TEST_CASE(DepthToSpaceTest, DataTypes); ++ ++TYPED_TEST(DepthToSpaceTest, SimpleCase) ++{ ++ std::vector input_data{1, 2, 3, 4, 5, 6, 7, 8}; ++ Shape input_shape{1, 1, 2, 4}; ++ std::vector output_data{1, 2, 5, 6, 3, 4, 7, 8}; ++ std::vector output_shape{1, 2, 4, 1}; ++ ++ Tensor input_tensor = makeInputTensor()>(input_shape, input_data); ++ Tensor output_tensor = makeOutputTensor(getElementType()); ++ ++ DepthToSpaceParams params{}; ++ params.block_size = 2; ++ ++ DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params); ++ kernel.configure(); ++ kernel.execute(); ++ ++ EXPECT_THAT(extractTensorData(output_tensor), ++ ::testing::ElementsAreArray(output_data)); ++ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); ++} ++ ++} // namespace ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp +index fad450d..f53eaca 100644 +--- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp ++++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp +@@ -45,12 +45,9 @@ TEST(L2NormalizeTest, Float) + ElementsAreArray(ArrayFloatNear(ref_output_data))); + } + +-TEST(L2NormalizeTest, Uint8Quantized) +-{ +- // TODO +- // Implement GetDequantizedOutput Function. +- // Create Test for Uint8 Case +-} ++// TODO Uint8Quantized ++// Implement GetDequantizedOutput Function. 
++// Create Test for Uint8 Case + + } // namespace + } // namespace kernels +diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp +index b0c06e7..c79d3d6 100644 +--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp ++++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp +@@ -61,15 +61,14 @@ TEST(LeakReluTest, FloatSimple) + 1.0f, -0.5f, -1.0f, // Row 2 + }, + /*alpha=*/0.5f, getElementType()); +-} + +-TEST(LeakReluTest, Uint8Simple) +-{ +- // TODO +- // Implement GetDequantizedOutput Function. +- // Create Test for Uint8 Case ++ SUCCEED(); + } + ++// TODO Uint8Simple ++// Implement GetDequantizedOutput Function. ++// Create Test for Uint8 Case ++ + } // namespace + } // namespace kernels + } // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp +index 17456a4..00feddf 100644 +--- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp ++++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp +@@ -49,10 +49,8 @@ TEST(LogisticTest, Float) + // TODO make a Shape checking of output_tensor. + } + +-TEST(LogisticTest, Uint8) +-{ +- // Need to Implement GetDequantizedOutput Function. +-} ++// TODO Uint8 ++// Need to Implement GetDequantizedOutput Function. + + } // namespace + } // namespace kernels +diff --git a/compiler/luci-interpreter/src/kernels/Reverse.cpp b/compiler/luci-interpreter/src/kernels/Reverse.cpp +new file mode 100644 +index 0000000..a463084 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Reverse.cpp +@@ -0,0 +1,81 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "kernels/Reverse.h" ++#include "kernels/Utils.h" ++#include ++ ++namespace luci_interpreter ++{ ++ ++namespace kernels ++{ ++ ++Reverse::Reverse(const Tensor *input, const Tensor *axes, Tensor *output) ++ : Kernel({input, axes}, {output}) ++{ ++} ++ ++void Reverse::configure() ++{ ++ assert(axes()->shape().num_dims() == 1); ++ assert(input()->shape().num_dims() >= axes()->shape().num_elements()); ++ if (input()->element_type() != DataType::S32 && input()->element_type() != DataType::FLOAT32 && ++ input()->element_type() != DataType::U8 && input()->element_type() != DataType::S16 && ++ input()->element_type() != DataType::S64) ++ { ++ throw std::runtime_error("Unsupported input type."); ++ } ++ if (axes()->element_type() != DataType::S32) ++ { ++ throw std::runtime_error("Unsupported axes type."); ++ } ++ if (axes()->shape().num_elements() > 1) ++ { ++ throw std::runtime_error("Current implementation does not support more than 1 axis."); ++ } ++ int axis_value = getTensorData(axes())[0]; ++ if (axis_value < 0 || axis_value >= input()->shape().num_dims()) ++ { ++ throw std::runtime_error("Invalid axes value"); ++ } ++ assert(input()->element_type() == output()->element_type()); ++ ++ output()->resize(input()->shape()); ++} ++ ++void Reverse::execute() const ++{ ++ int axis_value = getTensorData(axes())[0]; ++ switch (output()->element_type()) ++ { ++ case DataType::FLOAT32: ++ tflite::reference_ops::Reverse(axis_value, getTensorShape(input()), ++ getTensorData(input()), getTensorShape(output()), ++ getTensorData(output())); ++ break; ++ case DataType::U8: ++ tflite::reference_ops::Reverse( ++ axis_value, getTensorShape(input()), getTensorData(input()), ++ getTensorShape(output()), getTensorData(output())); ++ break; ++ default: ++ throw std::runtime_error("Unsupported output type"); ++ } ++} ++ ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/Reverse.h b/compiler/luci-interpreter/src/kernels/Reverse.h +new file mode 100644 +index 0000000..3489dae +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Reverse.h +@@ -0,0 +1,43 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#ifndef LUCI_INTERPRETER_KERNELS_REVERSE_H ++#define LUCI_INTERPRETER_KERNELS_REVERSE_H ++ ++#include "core/Kernel.h" ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++ ++class Reverse : public Kernel ++{ ++public: ++ Reverse(const Tensor *input, const Tensor *axes, Tensor *output); ++ ++ const Tensor *input() const { return _inputs[0]; } ++ const Tensor *axes() const { return _inputs[1]; } ++ Tensor *output() const { return _outputs[0]; } ++ ++ void configure() override; ++ void execute() const override; ++}; ++ ++} // namespace kernels ++} // namespace luci_interpreter ++ ++#endif // LUCI_INTERPRETER_KERNELS_REVERSE_H +diff --git a/compiler/luci-interpreter/src/kernels/Reverse.test.cpp b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp +new file mode 100644 +index 0000000..5475a8b +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp +@@ -0,0 +1,66 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright 2017 The TensorFlow Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "kernels/Reverse.h" ++#include "kernels/TestUtils.h" ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++namespace ++{ ++ ++using namespace testing; ++ ++template class ReverseTest : public ::testing::Test ++{ ++}; ++ ++using DataTypes = ::testing::Types; ++TYPED_TEST_CASE(ReverseTest, DataTypes); ++ ++TYPED_TEST(ReverseTest, MultiDimensions) ++{ ++ // TypeParam ++ std::vector input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ++ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; ++ Shape input_shape{4, 3, 2}; ++ std::vector axis_data{1}; ++ Shape axis_shape{1}; ++ ++ std::vector output_data{5, 6, 3, 4, 1, 2, 11, 12, 9, 10, 7, 8, ++ 17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}; ++ std::vector output_shape{4, 3, 2}; ++ ++ Tensor input_tensor = makeInputTensor()>(input_shape, input_data); ++ Tensor axis_tensor = makeInputTensor(axis_shape, axis_data); ++ ++ Tensor output_tensor = makeOutputTensor(getElementType()); ++ ++ Reverse kernel = Reverse(&input_tensor, &axis_tensor, &output_tensor); ++ kernel.configure(); ++ kernel.execute(); ++ ++ EXPECT_THAT(extractTensorData(output_tensor), ++ ::testing::ElementsAreArray(output_data)); ++ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); ++} ++ ++} // namespace ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-interpreter/src/kernels/Slice.cpp +new file mode 100644 +index 0000000..c4bc3c5 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Slice.cpp +@@ -0,0 +1,149 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "kernels/Slice.h" ++#include "Utils.h" ++#include ++ ++#include ++#include ++ ++namespace luci_interpreter ++{ ++ ++namespace kernels ++{ ++const int max_dim = 4; ++ ++Slice::Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output) ++ : Kernel({input, begin, size}, {output}) ++{ ++} ++ ++template ++Shape calculateOutputShape(const Tensor *input, const Tensor *begin, const Tensor *size) ++{ ++ Shape output_shape = Shape(input->shape().num_dims()); ++ for (int idx = 0; idx < input->shape().num_dims(); idx++) ++ { ++ T size_value = getTensorData(size)[idx]; ++ if (size_value < 0) ++ { ++ if (size_value != -1) ++ { ++ throw std::runtime_error("Invalid size."); ++ } ++ size_value = input->shape().dim(idx) - getTensorData(begin)[idx]; ++ } ++ else ++ { ++ if (input->shape().dim(idx) < getTensorData(begin)[idx] + size_value) ++ { ++ throw std::runtime_error("Invalid begin and size."); ++ } ++ } ++ output_shape.dim(idx) = static_cast(size_value); ++ } ++ return output_shape; ++} ++ ++template ++void getBeginAndSizeVectors(int dimensions, const Tensor *begin, const Tensor *size, ++ std::vector *begins, std::vector *sizes) ++{ ++ for (int idx = dimensions - 1; idx >= 0; --idx) ++ { ++ begins->push_back(getTensorData(begin)[idx]); ++ sizes->push_back(getTensorData(size)[idx]); ++ } ++} ++ ++void Slice::configure() ++{ ++ assert(input()->element_type() == output()->element_type()); ++ assert(begin()->element_type() == DataType::S32 || begin()->element_type() == DataType::S64); ++ assert(size()->element_type() == DataType::S32 || size()->element_type() == DataType::S64); ++ assert(begin()->shape().num_dims() == 1); ++ assert(size()->shape().num_dims() == 1); ++ assert(input()->shape().num_dims() <= max_dim); ++ ++ if (begin()->element_type() == DataType::S32) ++ { ++ output()->resize(calculateOutputShape(input(), begin(), size())); ++ } ++ else if (begin()->element_type() == DataType::S64) ++ { ++ output()->resize(calculateOutputShape(input(), begin(), size())); ++ } ++ else ++ { ++ throw std::runtime_error("Unsupported type."); ++ } ++} ++ ++void Slice::execute() const ++{ ++ std::vector begins; ++ begins.reserve(max_dim); ++ std::vector sizes; ++ sizes.reserve(max_dim); ++ if (begin()->element_type() == DataType::S32) ++ { ++ getBeginAndSizeVectors(input()->shape().num_dims(), begin(), size(), &begins, &sizes); ++ } ++ else if (begin()->element_type() == DataType::S64) ++ { ++ getBeginAndSizeVectors(input()->shape().num_dims(), begin(), size(), &begins, &sizes); ++ } ++ else ++ { ++ throw std::runtime_error("Unsupported begin type."); ++ } ++ for (int i = input()->shape().num_dims(); i < max_dim; ++i) ++ { ++ begins.push_back(0); ++ sizes.push_back(1); ++ } ++ ++ assert(begins.size() == 4); ++ assert(sizes.size() == 4); ++ tflite::SliceParams op_params{}; ++ op_params.begin_count = 4; ++ op_params.size_count = 4; ++ for (int i = 0; i < 4; i++) ++ { ++ op_params.begin[i] = begins[3 - i]; ++ op_params.size[i] = sizes[3 - i]; ++ } ++ switch (input()->element_type()) ++ { ++ case DataType::FLOAT32: ++ 
tflite::optimized_ops::Slice(op_params, getTensorShape(input()), ++ getTensorData(input()), getTensorShape(output()), ++ getTensorData(output())); ++ break; ++ case DataType::U8: ++ tflite::optimized_ops::Slice(op_params, getTensorShape(input()), ++ getTensorData(input()), getTensorShape(output()), ++ getTensorData(output())); ++ break; ++ default: ++ throw std::runtime_error("Unsupported input type."); ++ } ++} ++ ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/Slice.h b/compiler/luci-interpreter/src/kernels/Slice.h +new file mode 100644 +index 0000000..23c3596 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Slice.h +@@ -0,0 +1,44 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef LUCI_INTERPRETER_KERNELS_SLICE_H ++#define LUCI_INTERPRETER_KERNELS_SLICE_H ++ ++#include "core/Kernel.h" ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++ ++class Slice : public Kernel ++{ ++public: ++ Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output); ++ ++ const Tensor *input() const { return _inputs[0]; } ++ const Tensor *begin() const { return _inputs[1]; } ++ const Tensor *size() const { return _inputs[2]; } ++ Tensor *output() const { return _outputs[0]; } ++ ++ void configure() override; ++ void execute() const override; ++}; ++ ++} // namespace kernels ++} // namespace luci_interpreter ++ ++#endif // LUCI_INTERPRETER_KERNELS_SLICE_H +diff --git a/compiler/luci-interpreter/src/kernels/Slice.test.cpp b/compiler/luci-interpreter/src/kernels/Slice.test.cpp +new file mode 100644 +index 0000000..a360a29 +--- /dev/null ++++ b/compiler/luci-interpreter/src/kernels/Slice.test.cpp +@@ -0,0 +1,64 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "kernels/Slice.h" ++#include "kernels/TestUtils.h" ++ ++namespace luci_interpreter ++{ ++namespace kernels ++{ ++namespace ++{ ++ ++using namespace testing; ++ ++template class SliceTest : public ::testing::Test ++{ ++}; ++ ++using DataTypes = ::testing::Types; ++TYPED_TEST_CASE(SliceTest, DataTypes); ++ ++TYPED_TEST(SliceTest, SimpleTest) ++{ ++ std::vector input_data{1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}; ++ Shape input_shape{3, 2, 3, 1}; ++ std::vector begin_data{1, 0, 0, 0}; ++ Shape begin_shape{4}; ++ std::vector size_data{2, 1, -1, 1}; ++ Shape size_shape{4}; ++ std::vector output_data{3, 3, 3, 5, 5, 5}; ++ std::vector output_shape{2, 1, 3, 1}; ++ ++ Tensor input_tensor = makeInputTensor()>(input_shape, input_data); ++ Tensor begin_tensor = makeInputTensor(begin_shape, begin_data); ++ Tensor size_tensor = makeInputTensor(size_shape, size_data); ++ ++ Tensor output_tensor = makeOutputTensor(getElementType()); ++ ++ Slice kernel(&input_tensor, &begin_tensor, &size_tensor, &output_tensor); ++ kernel.configure(); ++ kernel.execute(); ++ ++ EXPECT_THAT(extractTensorData(output_tensor), ++ ::testing::ElementsAreArray(output_data)); ++ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); ++} ++ ++} // namespace ++} // namespace kernels ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp +index 3386d36..b8c0ac4 100644 +--- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp ++++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp +@@ -68,6 +68,8 @@ TEST(TransposeConvTest, FloatSimple) + /*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365}, + /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1, + getElementType()); ++ ++ SUCCEED(); + } + + TEST(TransposeConvTest, FloatTwoFiltersTest) +@@ -82,21 +84,18 @@ TEST(TransposeConvTest, FloatTwoFiltersTest) + 3352, 3652, 2760}, + /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1, + getElementType()); +-} + +-TEST(TransposeConvTest, Uint8Simple) +-{ +- // TODO +- // Implement GetDequantizedOutput Function. +- // Create Test for Uint8 Case +-} +-TEST(TransposeConvTest, Uint8FiltersTest) +-{ +- // TODO +- // Implement GetDequantizedOutput Function. +- // Create Test for Uint8 Case ++ SUCCEED(); + } + ++// TODO Uint8Simple ++// Implement GetDequantizedOutput Function. ++// Create Test for Uint8 Case ++ ++// TODO Uint8FiltersTest ++// Implement GetDequantizedOutput Function. 
++// Create Test for Uint8 Case ++ + } // namespace + } // namespace kernels + } // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-interpreter/src/loader/CMakeLists.txt +index fb36c4a..d99485d 100644 +--- a/compiler/luci-interpreter/src/loader/CMakeLists.txt ++++ b/compiler/luci-interpreter/src/loader/CMakeLists.txt +@@ -1,3 +1,5 @@ ++nnas_find_package(GTest REQUIRED) ++ + set(SOURCES + GraphLoader.h + GraphLoader.cpp +@@ -13,3 +15,8 @@ target_include_directories(luci_interpreter_loader PUBLIC "${LUCI_INTERPRETER_SO + target_link_libraries(luci_interpreter_loader + PUBLIC luci_lang luci_interpreter_core + PRIVATE luci_interpreter_kernels nncc_common) ++ ++set(TEST_SOURCES KernelBuilder.test.cpp) ++ ++GTest_AddTest(luci_interpreter_loader_test ${TEST_SOURCES}) ++target_link_libraries(luci_interpreter_loader_test luci_interpreter_loader) +diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp +index 779fa06..6ebf979 100644 +--- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp ++++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp +@@ -16,7 +16,6 @@ + + #include "loader/GraphLoader.h" + +-#include "loader/ModuleLoader.h" + #include "loader/KernelBuilder.h" + + #include +@@ -71,6 +70,7 @@ bool isExecutableNode(const luci::CircleNode *node) + { + // These nodes denote inputs / outputs of a graph. + case luci::CircleOpcode::CONST: ++ case luci::CircleOpcode::CIRCLECONST: + case luci::CircleOpcode::CIRCLEINPUT: + case luci::CircleOpcode::CIRCLEOUTPUT: + // The following nodes denote outputs of multiple-output nodes. +@@ -102,11 +102,12 @@ bool isTensorProducingNode(const luci::CircleNode *node) + + } // namespace + +-GraphLoader::GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph, +- RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, +- std::unordered_map &node_to_tensor) +- : _module_loader(module_loader), _graph(graph), _runtime_graph(runtime_graph), +- _runtime_to_ir(runtime_to_ir), _node_to_tensor(node_to_tensor) ++GraphLoader::GraphLoader( ++ const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, ++ const std::unordered_map &graph_to_runtime_graph, ++ std::unordered_map &node_to_tensor) ++ : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir), ++ _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor) + { + } + +@@ -136,6 +137,7 @@ void GraphLoader::loadTensors() + const luci::CircleQuantParam *params = node->quantparam(); + quantization.scale.assign(params->scale.cbegin(), params->scale.cend()); + quantization.zero_point.assign(params->zerop.cbegin(), params->zerop.cend()); ++ quantization.quantized_dimension = params->quantized_dimension; + } + + auto tensor = std::make_unique(node->dtype(), std::move(shape), std::move(quantization), +@@ -178,7 +180,7 @@ void GraphLoader::initInputOutputTensors() const + + void GraphLoader::loadOperators() + { +- KernelBuilder kernel_builder(_module_loader, *this); ++ KernelBuilder kernel_builder(_graph_to_runtime_graph, _node_to_tensor); + + // Create kernels for executable nodes. This has to be done in execution order. 
+ for (const loco::Node *loco_node : +@@ -195,11 +197,4 @@ void GraphLoader::loadOperators() + } + } + +-void GraphLoader::load() +-{ +- loadTensors(); +- initInputOutputTensors(); +- loadOperators(); +-} +- + } // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.h b/compiler/luci-interpreter/src/loader/GraphLoader.h +index e0adc0f..89c5bca 100644 +--- a/compiler/luci-interpreter/src/loader/GraphLoader.h ++++ b/compiler/luci-interpreter/src/loader/GraphLoader.h +@@ -27,29 +27,23 @@ + namespace luci_interpreter + { + +-class ModuleLoader; +- + class GraphLoader + { + public: +- GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph, +- RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, ++ GraphLoader(const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir, ++ const std::unordered_map &graph_to_runtime_graph, + std::unordered_map &node_to_tensor); + +- void load(); +- +- Tensor *getTensorForNode(const loco::Node *node) const { return _node_to_tensor.at(node); } +- +-private: +- void loadOperators(); +- void initInputOutputTensors() const; + void loadTensors(); ++ void initInputOutputTensors() const; ++ void loadOperators(); + +- const ModuleLoader &_module_loader; ++private: + const loco::Graph *_graph; + RuntimeGraph *_runtime_graph; + RuntimeToIR &_runtime_to_ir; + ++ const std::unordered_map &_graph_to_runtime_graph; + std::unordered_map &_node_to_tensor; + }; + +diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp +index 56da961..c19f897 100644 +--- a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp ++++ b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp +@@ -21,6 +21,7 @@ + #include "kernels/AveragePool2D.h" + #include "kernels/Concatenation.h" + #include "kernels/Conv2D.h" ++#include "kernels/DepthToSpace.h" + #include "kernels/DepthwiseConv2D.h" + #include "kernels/Elu.h" + #include "kernels/FullyConnected.h" +@@ -35,6 +36,8 @@ + #include "kernels/Mul.h" + #include "kernels/Pad.h" + #include "kernels/Reshape.h" ++#include "kernels/Reverse.h" ++#include "kernels/Slice.h" + #include "kernels/Softmax.h" + #include "kernels/SpaceToDepth.h" + #include "kernels/Split.h" +@@ -43,8 +46,6 @@ + #include "kernels/Unpack.h" + #include "kernels/Transpose.h" + #include "kernels/TransposeConv.h" +-#include "loader/GraphLoader.h" +-#include "loader/ModuleLoader.h" + + #include + +@@ -68,7 +69,7 @@ static std::vector collectOutputNodes(const luci::CircleNode + + const Tensor *KernelBuilder::getInputTensor(const loco::Node *node) const + { +- const Tensor *tensor = _graph_loader.getTensorForNode(node); ++ const Tensor *tensor = _node_to_tensor.at(node); + assert(tensor != nullptr); + return tensor; + } +@@ -81,7 +82,7 @@ const Tensor *KernelBuilder::getOptionalInputTensor(const loco::Node *node) cons + + Tensor *KernelBuilder::getOutputTensor(const loco::Node *node) const + { +- Tensor *tensor = _graph_loader.getTensorForNode(node); ++ Tensor *tensor = _node_to_tensor.at(node); + assert(tensor != nullptr); + return tensor; + } +@@ -98,7 +99,7 @@ KernelBuilder::getOutputTensors(const std::vector &nodes) co + + RuntimeGraph *KernelBuilder::getRuntimeGraph(const loco::Graph *graph) const + { +- RuntimeGraph *runtime_graph = _module_loader.getRuntimeGraph(graph); ++ RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph); + assert(runtime_graph != nullptr); + return runtime_graph; + } +@@ -120,14 +121,14 @@ std::unique_ptr 
KernelBuilder::visit(const luci::CircleAdd *node) + std::unique_ptr KernelBuilder::visit(const luci::CircleArgMax *node) + { + assert(node->arity() == 2); +- const Tensor *input1 = getInputTensor(node->input()); +- const Tensor *input2 = getInputTensor(node->dimension()); ++ const Tensor *input = getInputTensor(node->input()); ++ const Tensor *axis = getInputTensor(node->dimension()); + Tensor *output = getOutputTensor(node); + + ArgMaxParams params{}; + params.output_type = node->output_type(); + +- return std::make_unique(input1, input2, output, params); ++ return std::make_unique(input, axis, output, params); + } + + std::unique_ptr KernelBuilder::visit(const luci::CircleAveragePool2D *node) +@@ -188,6 +189,19 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleConv2D *node) + return std::make_unique(input, filter, bias, output, params); + } + ++std::unique_ptr KernelBuilder::visit(const luci::CircleDepthToSpace *node) ++{ ++ assert(node->arity() == 1); ++ ++ const Tensor *input = getInputTensor(node->input()); ++ Tensor *output = getOutputTensor(node); ++ ++ DepthToSpaceParams params{}; ++ params.block_size = node->block_size(); ++ ++ return std::make_unique(input, output, params); ++} ++ + std::unique_ptr KernelBuilder::visit(const luci::CircleDepthwiseConv2D *node) + { + assert(node->arity() == 3); +@@ -224,14 +238,14 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleFullyConnected *n + assert(node->arity() == 3); + + const Tensor *input = getInputTensor(node->input()); +- const Tensor *filter = getInputTensor(node->weights()); ++ const Tensor *weights = getInputTensor(node->weights()); + const Tensor *bias = getOptionalInputTensor(node->bias()); + Tensor *output = getOutputTensor(node); + + FullyConnectedParams params{}; + params.activation = node->fusedActivationFunction(); + +- return std::make_unique(input, filter, bias, output, params); ++ return std::make_unique(input, weights, bias, output, params); + } + + std::unique_ptr KernelBuilder::visit(const luci::CircleIf *node) +@@ -255,6 +269,11 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleIf *node) + else_graph); + } + ++std::unique_ptr KernelBuilder::visit(const luci::CircleInput *) ++{ ++ throw std::runtime_error("Input node cannot be executed."); ++} ++ + std::unique_ptr KernelBuilder::visit(const luci::CircleL2Normalize *node) + { + assert(node->arity() == 1); +@@ -323,11 +342,6 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleLogistic *node) + return std::make_unique(input, output); + } + +-std::unique_ptr KernelBuilder::visit(const luci::CircleInput *) +-{ +- throw std::runtime_error("Input node cannot be executed."); +-} +- + std::unique_ptr KernelBuilder::visit(const luci::CircleMaxPool2D *node) + { + assert(node->arity() == 1); +@@ -402,6 +416,30 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleReshape *node) + return std::make_unique(input, shape, output); + } + ++std::unique_ptr KernelBuilder::visit(const luci::CircleReverseV2 *node) ++{ ++ assert(node->arity() == 2); ++ ++ const Tensor *input = getInputTensor(node->tensor()); ++ const Tensor *axes = getInputTensor(node->axis()); ++ Tensor *output = getOutputTensor(node); ++ ++ return std::make_unique(input, axes, output); ++} ++ ++std::unique_ptr KernelBuilder::visit(const luci::CircleSlice *node) ++{ ++ assert(node->arity() == 3); ++ ++ const Tensor *input = getInputTensor(node->input()); ++ const Tensor *begin = getInputTensor(node->begin()); ++ const Tensor *size = getInputTensor(node->size()); ++ ++ Tensor *output = 
getOutputTensor(node); ++ ++ return std::make_unique(input, begin, size, output); ++} ++ + std::unique_ptr KernelBuilder::visit(const luci::CircleSoftmax *node) + { + assert(node->arity() == 1); +@@ -442,6 +480,19 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleSplit *node) + return std::make_unique(axis, input, std::move(outputs)); + } + ++std::unique_ptr KernelBuilder::visit(const luci::CircleSqueeze *node) ++{ ++ assert(node->arity() == 1); ++ ++ const Tensor *input = getInputTensor(node->input()); ++ Tensor *output = getOutputTensor(node); ++ ++ SqueezeParams params{}; ++ params.squeeze_dims = node->squeeze_dims(); ++ ++ return std::make_unique(input, output, params); ++} ++ + std::unique_ptr KernelBuilder::visit(const luci::CircleStridedSlice *node) + { + assert(node->arity() == 4); +@@ -463,21 +514,15 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleStridedSlice *nod + return std::make_unique(input, begin, end, strides, output, params); + } + +-std::unique_ptr KernelBuilder::visit(const luci::CircleSqueeze *node) ++std::unique_ptr KernelBuilder::visit(const luci::CircleTranspose *node) + { +- assert(node->arity() == 1); ++ assert(node->arity() == 2); + +- const Tensor *input = getInputTensor(node->input()); ++ const Tensor *input = getInputTensor(node->a()); ++ const Tensor *perm = getInputTensor(node->perm()); + Tensor *output = getOutputTensor(node); + +- SqueezeParams params{}; +- assert(node->squeeze_dims().size() <= 4); +- for (size_t i = 0; i < node->squeeze_dims().size(); i++) +- { +- params.squeeze_dims.push_back(node->squeeze_dims().at(i)); +- } +- +- return std::make_unique(input, output, params); ++ return std::make_unique(input, perm, output); + } + + std::unique_ptr KernelBuilder::visit(const luci::CircleTransposeConv *node) +@@ -515,15 +560,4 @@ std::unique_ptr KernelBuilder::visit(const luci::CircleUnpack *node) + return std::make_unique(input, std::move(outputs), params); + } + +-std::unique_ptr KernelBuilder::visit(const luci::CircleTranspose *node) +-{ +- assert(node->arity() == 2); +- +- const Tensor *input = getInputTensor(node->a()); +- const Tensor *perm = getInputTensor(node->perm()); +- Tensor *output = getOutputTensor(node); +- +- return std::make_unique(input, perm, output); +-} +- + } // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-interpreter/src/loader/KernelBuilder.h +index 7e30d39..d5c5a4b 100644 +--- a/compiler/luci-interpreter/src/loader/KernelBuilder.h ++++ b/compiler/luci-interpreter/src/loader/KernelBuilder.h +@@ -24,18 +24,18 @@ + + #include + #include ++#include + + namespace luci_interpreter + { + +-class GraphLoader; +-class ModuleLoader; +- + class KernelBuilder : public luci::CircleNodeVisitor> + { + public: +- KernelBuilder(const ModuleLoader &module_loader, const GraphLoader &graph_loader) +- : _module_loader(module_loader), _graph_loader(graph_loader) ++ KernelBuilder( ++ const std::unordered_map &graph_to_runtime_graph, ++ const std::unordered_map &node_to_tensor) ++ : _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor) + { + } + +@@ -45,6 +45,7 @@ public: + std::unique_ptr visit(const luci::CircleConcatenation *node) override; + std::unique_ptr visit(const luci::CircleConv2D *node) override; + std::unique_ptr visit(const luci::CircleConst *node) override; ++ std::unique_ptr visit(const luci::CircleDepthToSpace *node) override; + std::unique_ptr visit(const luci::CircleDepthwiseConv2D *node) override; + std::unique_ptr 
visit(const luci::CircleElu *node) override; + std::unique_ptr visit(const luci::CircleFullyConnected *node) override; +@@ -61,6 +62,8 @@ public: + std::unique_ptr visit(const luci::CircleOutput *node) override; + std::unique_ptr visit(const luci::CirclePad *node) override; + std::unique_ptr visit(const luci::CircleReshape *node) override; ++ std::unique_ptr visit(const luci::CircleReverseV2 *node) override; ++ std::unique_ptr visit(const luci::CircleSlice *node) override; + std::unique_ptr visit(const luci::CircleSoftmax *node) override; + std::unique_ptr visit(const luci::CircleSpaceToDepth *node) override; + std::unique_ptr visit(const luci::CircleSplit *node) override; +@@ -82,8 +85,8 @@ private: + RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const; + + private: +- const ModuleLoader &_module_loader; +- const GraphLoader &_graph_loader; ++ const std::unordered_map &_graph_to_runtime_graph; ++ const std::unordered_map &_node_to_tensor; + }; + + } // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp +new file mode 100644 +index 0000000..33bc8ec +--- /dev/null ++++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp +@@ -0,0 +1,743 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "loader/GraphLoader.h" ++#include "loader/KernelBuilder.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++namespace luci_interpreter ++{ ++namespace ++{ ++ ++using namespace testing; ++ ++class KernelBuilderTest : public Test ++{ ++protected: ++ luci::CircleInput *createInputNode() { return createNode(); } ++ ++ template NodeT *createNode(Args &&... args) ++ { ++ auto *node = _graph.nodes()->create(std::forward(args)...); ++ // The actual type does not matter for the purpose of the tests. ++ // NOTE The type is meaningless for nodes with multiple outputs (corresponding *Out nodes carry ++ // actual output types). 
++ node->dtype(loco::DataType::FLOAT32); ++ return node; ++ } ++ ++ template NodeOutT *createNodeOut(loco::Node *node, int index) ++ { ++ auto *node_out = createNode(); ++ node_out->input(node); ++ node_out->index(index); ++ return node_out; ++ } ++ ++ template std::unique_ptr buildKernel(const luci::CircleNode *op) ++ { ++ std::unordered_map graph_to_runtime_graph; ++ ++ RuntimeGraph runtime_graph(nullptr); ++ RuntimeToIR runtime_to_ir; ++ GraphLoader graph_loader(&_graph, &runtime_graph, runtime_to_ir, graph_to_runtime_graph, ++ _node_to_tensor); ++ graph_loader.loadTensors(); ++ ++ KernelBuilder kernel_builder(graph_to_runtime_graph, _node_to_tensor); ++ ++ auto kernel = op->accept(&kernel_builder); ++ return std::unique_ptr(dynamic_cast(kernel.release())); ++ } ++ ++ void checkTensor(const Tensor *tensor, const loco::Node *node) ++ { ++ EXPECT_THAT(tensor, Eq(_node_to_tensor.at(node))); ++ } ++ ++private: ++ loco::Graph _graph; ++ std::unordered_map _node_to_tensor; ++}; ++ ++TEST_F(KernelBuilderTest, Add) ++{ ++ auto *input1 = createInputNode(); ++ auto *input2 = createInputNode(); ++ ++ auto *op = createNode(); ++ op->x(input1); ++ op->y(input2); ++ ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input1(), input1); ++ checkTensor(kernel->input2(), input2); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, ArgMax) ++{ ++ auto *input = createInputNode(); ++ auto *axis = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->dimension(axis); ++ ++ op->output_type(loco::DataType::FLOAT32); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->axis(), axis); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().output_type, Eq(op->output_type())); ++} ++ ++TEST_F(KernelBuilderTest, AveragePool2D) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->value(input); ++ ++ op->padding(luci::Padding::SAME); ++ op->filter()->h(11); ++ op->filter()->w(13); ++ op->stride()->h(17); ++ op->stride()->w(19); ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); ++ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, Concatenation) ++{ ++ auto *input1 = createInputNode(); ++ auto *input2 = createInputNode(); ++ ++ auto *op = createNode(2); ++ op->values(0, input1); ++ op->values(1, input2); ++ op->axis(11); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(0), input1); ++ checkTensor(kernel->input(1), input2); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().axis, Eq(op->axis())); ++} ++ ++TEST_F(KernelBuilderTest, Conv2D) ++{ ++ auto *input = createInputNode(); ++ auto *filter = createInputNode(); ++ auto *bias = 
createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->filter(filter); ++ op->bias(bias); ++ ++ op->padding(luci::Padding::SAME); ++ op->stride()->h(11); ++ op->stride()->w(13); ++ op->dilation()->h(17); ++ op->dilation()->w(19); ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->filter(), filter); ++ checkTensor(kernel->bias(), bias); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++ EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h())); ++ EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w())); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, DepthToSpace) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ ++ op->block_size(11); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().block_size, Eq(op->block_size())); ++} ++ ++TEST_F(KernelBuilderTest, DepthwiseConv2D) ++{ ++ auto *input = createInputNode(); ++ auto *filter = createInputNode(); ++ auto *bias = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->filter(filter); ++ op->bias(bias); ++ ++ op->padding(luci::Padding::SAME); ++ op->depthMultiplier(11); ++ op->stride()->h(13); ++ op->stride()->w(17); ++ op->dilation()->h(19); ++ op->dilation()->w(23); ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->filter(), filter); ++ checkTensor(kernel->bias(), bias); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().depth_multiplier, Eq(op->depthMultiplier())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++ EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h())); ++ EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w())); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, Elu) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->features(input); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, FullyConnected) ++{ ++ auto *input = createInputNode(); ++ auto *weights = createInputNode(); ++ auto *bias = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->weights(weights); ++ op->bias(bias); ++ ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->weights(), weights); ++ checkTensor(kernel->bias(), bias); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ 
++TEST_F(KernelBuilderTest, L2Normalize) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->x(input); ++ ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, L2Pool2D) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->value(input); ++ ++ op->padding(luci::Padding::SAME); ++ op->filter()->h(11); ++ op->filter()->w(13); ++ op->stride()->h(17); ++ op->stride()->w(19); ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); ++ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, LeakyRelu) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->features(input); ++ ++ op->alpha(11.0f); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().alpha, Eq(op->alpha())); ++} ++ ++TEST_F(KernelBuilderTest, LocalResponseNormalization) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ ++ op->radius(11); ++ op->bias(13.0f); ++ op->alpha(15.0f); ++ op->beta(17.0f); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().radius, Eq(op->radius())); ++ EXPECT_THAT(kernel->params().bias, Eq(op->bias())); ++ EXPECT_THAT(kernel->params().alpha, Eq(op->alpha())); ++ EXPECT_THAT(kernel->params().beta, Eq(op->beta())); ++} ++ ++TEST_F(KernelBuilderTest, Logistic) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->x(input); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, MaxPool2D) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->value(input); ++ ++ op->padding(luci::Padding::SAME); ++ op->filter()->h(11); ++ op->filter()->w(13); ++ op->stride()->h(17); ++ op->stride()->w(19); ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h())); ++ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, 
Mean) ++{ ++ auto *input = createInputNode(); ++ auto *axes = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->reduction_indices(axes); ++ ++ op->keep_dims(true); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->axes(), axes); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().keep_dims, Eq(op->keep_dims())); ++} ++ ++TEST_F(KernelBuilderTest, Mul) ++{ ++ auto *input1 = createInputNode(); ++ auto *input2 = createInputNode(); ++ ++ auto *op = createNode(); ++ op->x(input1); ++ op->y(input2); ++ ++ op->fusedActivationFunction(luci::FusedActFunc::RELU); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input1(), input1); ++ checkTensor(kernel->input2(), input2); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); ++} ++ ++TEST_F(KernelBuilderTest, Pad) ++{ ++ auto *input = createInputNode(); ++ auto *paddings = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->paddings(paddings); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->paddings(), paddings); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, Reshape) ++{ ++ auto *input = createInputNode(); ++ auto *shape = createInputNode(); ++ ++ auto *op = createNode(); ++ op->tensor(input); ++ op->shape(shape); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->shape(), shape); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, ReverseV2) ++{ ++ auto *input = createInputNode(); ++ auto *axes = createInputNode(); ++ ++ auto *op = createNode(); ++ op->tensor(input); ++ op->axis(axes); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->axes(), axes); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, Slice) ++{ ++ auto *input = createInputNode(); ++ auto *begin = createInputNode(); ++ auto *size = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->begin(begin); ++ op->size(size); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->begin(), begin); ++ checkTensor(kernel->size(), size); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, Softmax) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->logits(input); ++ ++ op->beta(11.0f); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().beta, Eq(op->beta())); ++} ++ ++TEST_F(KernelBuilderTest, SpaceToDepth) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ ++ op->block_size(11); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().block_size, op->block_size()); ++} ++ ++TEST_F(KernelBuilderTest, Split) ++{ ++ auto *axis = createInputNode(); ++ auto *input = createInputNode(); ++ auto *op = createNode(); ++ auto *output1 = createNodeOut(op, 0); ++ 
auto *output2 = createNodeOut(op, 1); ++ ++ op->split_dim(axis); ++ op->input(input); ++ ++ op->num_split(2); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->axis(), axis); ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(0), output1); ++ checkTensor(kernel->output(1), output2); ++} ++ ++TEST_F(KernelBuilderTest, Squeeze) ++{ ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ ++ op->squeeze_dims({11, 13}); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().squeeze_dims, ElementsAreArray(op->squeeze_dims())); ++} ++ ++TEST_F(KernelBuilderTest, StridedSlice) ++{ ++ auto *input = createInputNode(); ++ auto *begin = createInputNode(); ++ auto *end = createInputNode(); ++ auto *strides = createInputNode(); ++ ++ auto *op = createNode(); ++ op->input(input); ++ op->begin(begin); ++ op->end(end); ++ op->strides(strides); ++ ++ op->begin_mask(11); ++ op->ellipsis_mask(13); ++ op->end_mask(17); ++ op->new_axis_mask(19); ++ op->shrink_axis_mask(23); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->begin(), begin); ++ checkTensor(kernel->end(), end); ++ checkTensor(kernel->strides(), strides); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().begin_mask, Eq(op->begin_mask())); ++ EXPECT_THAT(kernel->params().ellipsis_mask, Eq(op->ellipsis_mask())); ++ EXPECT_THAT(kernel->params().end_mask, Eq(op->end_mask())); ++ EXPECT_THAT(kernel->params().new_axis_mask, Eq(op->new_axis_mask())); ++ EXPECT_THAT(kernel->params().shrink_axis_mask, Eq(op->shrink_axis_mask())); ++} ++ ++TEST_F(KernelBuilderTest, Transpose) ++{ ++ auto *input = createInputNode(); ++ auto *perm = createInputNode(); ++ ++ auto *op = createNode(); ++ op->a(input); ++ op->perm(perm); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->perm(), perm); ++ checkTensor(kernel->output(), op); ++} ++ ++TEST_F(KernelBuilderTest, TransposeConv) ++{ ++ auto *output_shape = createInputNode(); ++ auto *filter = createInputNode(); ++ auto *input = createInputNode(); ++ ++ auto *op = createNode(); ++ op->inputSizes(output_shape); ++ op->filter(filter); ++ op->outBackprop(input); ++ ++ op->padding(luci::Padding::SAME); ++ op->stride()->h(11); ++ op->stride()->w(13); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->output_shape(), output_shape); ++ checkTensor(kernel->filter(), filter); ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(), op); ++ EXPECT_THAT(kernel->params().padding, Eq(op->padding())); ++ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h())); ++ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w())); ++} ++ ++TEST_F(KernelBuilderTest, Unpack) ++{ ++ auto *input = createInputNode(); ++ auto *op = createNode(); ++ auto *output1 = createNodeOut(op, 0); ++ auto *output2 = createNodeOut(op, 1); ++ ++ op->value(input); ++ ++ op->num(2); ++ op->axis(11); ++ ++ auto kernel = buildKernel(op); ++ ASSERT_THAT(kernel, NotNull()); ++ ++ checkTensor(kernel->input(), input); ++ checkTensor(kernel->output(0), output1); ++ checkTensor(kernel->output(1), output2); ++ EXPECT_THAT(kernel->params().axis, Eq(op->axis())); ++} ++ 
++TEST_F(KernelBuilderTest, NonExisting1_NEG) ++{ ++ auto *op = createNode(); ++ ASSERT_ANY_THROW(buildKernel(op)); ++} ++ ++TEST_F(KernelBuilderTest, NonExisting2_NEG) ++{ ++ auto *op = createNode(); ++ ASSERT_ANY_THROW(buildKernel(op)); ++} ++ ++TEST_F(KernelBuilderTest, NonExisting3_NEG) ++{ ++ auto *op = createNode(); ++ ASSERT_ANY_THROW(buildKernel(op)); ++} ++ ++} // namespace ++} // namespace luci_interpreter +diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp +index 7780a61..b9a2ae0 100644 +--- a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp ++++ b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp +@@ -41,8 +41,11 @@ void ModuleLoader::load() + { + const loco::Graph *graph = _module->graph(i); + RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph); +- GraphLoader loader(*this, graph, runtime_graph, _runtime_to_ir, _node_to_tensor); +- loader.load(); ++ GraphLoader loader(graph, runtime_graph, _runtime_to_ir, _graph_to_runtime_graph, ++ _node_to_tensor); ++ loader.loadTensors(); ++ loader.initInputOutputTensors(); ++ loader.loadOperators(); + } + } + +diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.h b/compiler/luci-interpreter/src/loader/ModuleLoader.h +index 954dbfb..1af0ed7 100644 +--- a/compiler/luci-interpreter/src/loader/ModuleLoader.h ++++ b/compiler/luci-interpreter/src/loader/ModuleLoader.h +@@ -36,11 +36,6 @@ public: + + void load(); + +- RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const +- { +- return _graph_to_runtime_graph.at(graph); +- } +- + private: + const luci::Module *_module; + RuntimeModule *_runtime_module; +diff --git a/compiler/luci-value-test/evalverify.sh b/compiler/luci-value-test/evalverify.sh +index dfd55a6..12c9a45 100755 +--- a/compiler/luci-value-test/evalverify.sh ++++ b/compiler/luci-value-test/evalverify.sh +@@ -4,8 +4,10 @@ + # + # HOW TO USE + # +-# ./evalverify.sh ... +-# work_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test) ++# ./evalverify.sh ... 
++# bin_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test)
++# work_dir : artifacts directory where test materials exist
++# venv_dir : python virtual environment home directory
+ 
+ VERIFY_SOURCE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ VERIFY_SCRIPT_PATH="${VERIFY_SOURCE_PATH}/luci_eval_verifier.py"
+diff --git a/compiler/luci-value-test/test.lst b/compiler/luci-value-test/test.lst
+index 6a332f9..364d881 100644
+--- a/compiler/luci-value-test/test.lst
++++ b/compiler/luci-value-test/test.lst
+@@ -1,6 +1,8 @@
+ #addeval(Abs_000)
+ addeval(Add_000)
++#addeval(Add_001)
+ addeval(Add_U8_000)
++#addeval(AddN_000)
+ #addeval(ArgMax_000)
+ #addeval(ArgMax_001)
+ #addeval(ArgMax_002)
+@@ -9,73 +11,173 @@ addeval(Add_U8_000)
+ #addeval(ArgMax_U8_001)
+ #addeval(ArgMax_U8_002)
+ #addeval(ArgMax_U8_003)
++#addeval(ArgMin_000)
++#addeval(ArgMin_001)
++#addeval(ArgMin_002)
++#addeval(ArgMin_003)
++#addeval(ArgMin_U8_000)
++#addeval(ArgMin_U8_001)
++#addeval(ArgMin_U8_002)
++#addeval(ArgMin_U8_003)
+ addeval(AveragePool2D_000)
++#addeval(BatchMatMul_000)
+ #addeval(BatchMatMulV2_000)
+ #addeval(BatchMatMulV2_001)
+ #addeval(BatchToSpaceND_000)
+ #addeval(Cast_000)
++#addeval(Cast_001)
++#addeval(Ceil_000)
+ addeval(Concatenation_000)
+ addeval(Concatenation_U8_000)
+ addeval(Conv2D_000)
+ addeval(Conv2D_001)
+ addeval(Conv2D_002)
++#addeval(Conv2D_003)
+ addeval(Conv2D_U8_000)
+ addeval(Conv2D_U8_001)
+ #addeval(Cos_000)
++#addeval(DepthToSpace_000)
+ addeval(DepthwiseConv2D_000)
+ addeval(DepthwiseConv2D_U8_000)
++#addeval(DepthwiseConv2D_U8_001)
++addeval(DepthwiseConv2D_001)
+ #addeval(Div_000)
++#addeval(ELU_000)
+ #addeval(Equal_000)
+ #addeval(Exp_000)
++#addeval(ExpandDims_000)
++#addeval(ExpandDims_001)
++#addeval(ExpandDims_002)
++#addeval(ExpandDims_003)
++#addeval(Fill_000)
++#addeval(Fill_001)
++#addeval(Floor_000)
++#addeval(FloorDiv_000)
++#addeval(FloorDiv_001)
++#addeval(FloorMod_000)
++#addeval(FloorMod_001)
+ addeval(FullyConnected_000)
+ addeval(FullyConnected_001)
+ #addeval(FullyConnected_002)
+ #addeval(FullyConnected_U8_000)
+ #addeval(Gather_000)
++#addeval(GatherNd_000)
++#addeval(Greater_000)
++#addeval(GreaterEqual_000)
+ #addeval(If_000)
+ #addeval(If_001)
++addeval(L2Normalize_000)
++addeval(L2Pool2D_000)
++#addeval(L2Pool2D_U8_000)
++#addeval(LeakyRelu_000)
++#addeval(Less_000)
++#addeval(LessEqual_000)
++#addeval(LocalResponseNormalization_000)
++#addeval(Log_000)
++#addeval(LogicalAnd_000)
+ #addeval(LogicalNot_000)
+ #addeval(LogicalOr_000)
+-#addeval(Logistic_000)
++addeval(Logistic_000)
++#addeval(LogSoftmax_000)
++#addeval(MatMul_000)
++#addeval(MatrixDiag_000)
++#addeval(MatrixSetDiag_000)
++#addeval(Maximum_000)
+ addeval(MaxPool2D_000)
+ addeval(MaxPool2D_U8_000)
+ addeval(Mean_000)
+ addeval(Mean_001)
+ addeval(Mean_U8_000)
++#addeval(Minimum_000)
++#addeval(MirrorPad_000)
+ addeval(Mul_000)
+ #addeval(Mul_U8_000)
++#addeval(Neg_000)
++#addeval(NotEqual_000)
++#addeval(OneHot_000)
++#addeval(OneHot_001)
++#addeval(OneHot_002)
++#addeval(OneHot_003)
+ #addeval(Pack_000)
+ #addeval(Pack_U8_000)
+ addeval(Pad_000)
+ addeval(Pad_U8_000)
++#addeval(Pow_000)
++#addeval(PRelu_000)
++#addeval(Range_000)
++#addeval(Rank_000)
++#addeval(ReduceAny_000)
++#addeval(ReduceAny_001)
++#addeval(ReduceAny_002)
++#addeval(ReduceAny_003)
++#addeval(ReduceMax_000)
++#addeval(ReduceMin_000)
+ #addeval(ReduceProd_000)
+ #addeval(ReduceProd_001)
+ #addeval(ReduceProd_002)
+ #addeval(ReduceProd_003)
+ #addeval(ReLU_000)
++#addeval(ReLU6_000)
++#addeval(ReLUN1To1_000) + addeval(Reshape_000) + addeval(Reshape_001) + addeval(Reshape_002) + #addeval(Reshape_003) + addeval(Reshape_U8_000) ++#addeval(ResizeBilinear_000) ++#addeval(ResizeNearestNeighbor_000) ++#addeval(ReverseSequence_000) ++#addeval(ReverseV2_000) ++#addeval(Round_000) + #addeval(Rsqrt_000) ++#addeval(ScatterNd_000) ++#addeval(SegmentSum_000) ++#addeval(Select_000) ++#addeval(Select_001) ++#addeval(Select_002) ++#addeval(SelectV2_000) ++#addeval(SelectV2_001) ++#addeval(SelectV2_002) ++#addeval(Shape_000) + #addeval(Sin_000) ++addeval(Slice_000) + addeval(Softmax_000) + #addeval(Softmax_U8_000) + #addeval(SpaceToBatchND_000) + #addeval(SpaceToBatchND_001) + #addeval(SpaceToBatchND_002) + #addeval(SpaceToBatchND_003) +-#addeval(StridedSlice_000) +-#addeval(StridedSlice_001) ++#addeval(SpaceToDepth_000) ++#addeval(SparseToDense_000) ++#addeval(Split_000) ++#addeval(SplitV_000) ++#addeval(Sqrt_000) ++#addeval(Square_000) ++#addeval(SquaredDifference_000) ++addeval(Squeeze_000) ++addeval(StridedSlice_000) ++addeval(StridedSlice_001) ++addeval(StridedSlice_002) + #addeval(Sub_000) + #addeval(Sub_U8_000) ++#addeval(Sum_000) ++#addeval(Sum_001) + #addeval(Tanh_000) + #addeval(Tile_000) + #addeval(Tile_U8_000) +-#addeval(Transpose_000) ++#addeval(TopKV2_000) ++#addeval(TopKV2_001) ++addeval(Transpose_000) ++#addeval(TransposeConv_000) + #addeval(Unpack_000) + #addeval(Unpack_001) + #addeval(Unpack_002) ++addeval(Unpack_003) ++#addeval(Where_000) ++#addeval(Where_001) + #addeval(While_000) + #addeval(While_001) ++#addeval(While_002) ++#addeval(While_003) ++#addeval(YUV_TO_RGB_U8_000) ++#addeval(ZerosLike_000) +diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp +index 3c01b67..344c99f 100644 +--- a/compiler/luci/export/src/CircleOperationExporter.cpp ++++ b/compiler/luci/export/src/CircleOperationExporter.cpp +@@ -890,7 +890,7 @@ void OperationExporter::visit(luci::CircleSpaceToDepth *node) + { + export_simple(node, circle::BuiltinOperator_SPACE_TO_DEPTH, + circle::BuiltinOptions_SpaceToDepthOptions, +- CreateSpaceToDepthOptions(builder).Union()); ++ CreateSpaceToDepthOptions(builder, node->block_size()).Union()); + } + + void OperationExporter::visit(luci::CircleSparseToDense *node) +diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp +index 5cad392..dc8c2fb 100644 +--- a/compiler/luci/export/src/CircleTensorExporter.cpp ++++ b/compiler/luci/export/src/CircleTensorExporter.cpp +@@ -302,7 +302,10 @@ encodeQuantizationParameters(FlatBufferBuilder &builder, luci::CircleQuantParam + scale = builder.CreateVector(quantparam->scale); + zero_point = builder.CreateVector(quantparam->zerop); + } +- return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point); ++ // Note: QuantizationDetails is not supported ++ return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point, ++ circle::QuantizationDetails::QuantizationDetails_NONE, ++ 0, quantparam->quantized_dimension); + } + + void exportOpDefinedTensor(const CircleTensoInfo &info, FlatBufferBuilder &builder, +diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp +index 81e945d..bc7f397 100644 +--- a/compiler/luci/import/src/CircleReader.cpp ++++ b/compiler/luci/import/src/CircleReader.cpp +@@ -156,6 +156,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization) + const auto &max = quantization->max; + 
const auto &scale = quantization->scale;
+   const auto &zero_point = quantization->zero_point;
++  const auto &quantized_dimension = quantization->quantized_dimension;
+ 
+   if ((!min.empty() && !max.empty()) || (!scale.empty() && !zero_point.empty()))
+   {
+@@ -165,6 +166,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization)
+     quantparam->max = max;
+     quantparam->scale = scale;
+     quantparam->zerop = zero_point;
++    quantparam->quantized_dimension = quantized_dimension;
+ 
+     return quantparam;
+   }
+diff --git a/compiler/luci/import/src/Importer.test.cpp b/compiler/luci/import/src/Importer.test.cpp
+index 4426e15..8366546 100644
+--- a/compiler/luci/import/src/Importer.test.cpp
++++ b/compiler/luci/import/src/Importer.test.cpp
+@@ -20,4 +20,9 @@
+ 
+ #include
+ 
+-TEST(TensorFlowLiteImport, Dummy) { luci::Importer import; }
++TEST(TensorFlowLiteImport, Dummy)
++{
++  luci::Importer import;
++
++  SUCCEED();
++}
+diff --git a/compiler/luci/import/src/Nodes/CircleLogistic.cpp b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
+index 85e7e55..c77c55e 100644
+--- a/compiler/luci/import/src/Nodes/CircleLogistic.cpp
++++ b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
+@@ -32,21 +32,7 @@ bool CircleLogisticGraphBuilder::validate(const ValidateArgs &args) const
+   if (outputs.size() != 1)
+     return false;
+ 
+-  // Must be one of the following types
+-  // float16, float32, float64, complex64, or complex128
+   const auto &tensors = args.reader.tensors();
+-  const auto &tensor = tensors.at(inputs[0]);
+-  switch (tensor->type)
+-  {
+-    case circle::TensorType_FLOAT16:
+-    case circle::TensorType_FLOAT32:
+-    case circle::TensorType_FLOAT64:
+-    case circle::TensorType_COMPLEX64:
+-      break;
+-    default:
+-      return false;
+-  }
+-
+   if (tensors.at(inputs[0])->type != tensors.at(outputs[0])->type)
+     return false;
+ 
+diff --git a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
+index 7bdf46d..eb0956c 100644
+--- a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
++++ b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
+@@ -30,6 +30,24 @@ bool CircleTransposeConvGraphBuilder::validate(const ValidateArgs &args) const
+   if (args.op.inputs.size() != 3)
+     return false;
+ 
++  const auto &inputs = args.op.inputs;
++  const auto &tensors = args.reader.tensors();
++  const auto &filter_tensor = tensors.at(inputs[1]);
++  const auto &filter_shape = filter_tensor.get()->shape;
++  const auto &ifm_tensor = tensors.at(inputs[2]);
++  const auto &ifm_shape = ifm_tensor.get()->shape;
++
++  // ifm and filters must be 4-D tensor
++  if (ifm_shape.size() != 4)
++    return false;
++  if (filter_shape.size() != 4)
++    return false;
++
++  // input shape : [batch, height, width, in_channels]
++  // filters shape : [output_channels, height, width, in_channels]
++  if (ifm_tensor.get()->shape.at(3) != filter_tensor.get()->shape.at(3))
++    return false;
++
+   return true;
+ }
+ 
+diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.lst b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+index 488dcfb..acd7921 100644
+--- a/compiler/luci/lang/include/luci/IR/CircleNodes.lst
++++ b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+@@ -120,6 +120,7 @@ CIRCLE_NODE(BCQ_FULLY_CONNECTED, luci::CircleBCQFullyConnected)
+ CIRCLE_NODE(BCQ_GATHER, luci::CircleBCQGather)
+ CIRCLE_NODE(INSTANCE_NORM, luci::CircleInstanceNorm)
+ // Virtual node(s)
++CIRCLE_NODE(CIRCLECONST, void)
+ CIRCLE_NODE(CIRCLEINPUT, luci::CircleInput)
+ CIRCLE_NODE(CIRCLEOUTPUT, luci::CircleOutput)
+ CIRCLE_NODE(CIRCLEOUTPUTDUMMY, luci::CircleOutputDummy) +diff --git a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h +index 7253e65..6944373 100644 +--- a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h ++++ b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h +@@ -29,6 +29,7 @@ struct CircleQuantParam + std::vector max; + std::vector scale; + std::vector zerop; ++ int32_t quantized_dimension{0}; + }; + + } // namespace luci +diff --git a/compiler/luci/lang/src/Module.test.cpp b/compiler/luci/lang/src/Module.test.cpp +index 26bf073..a5973e5 100644 +--- a/compiler/luci/lang/src/Module.test.cpp ++++ b/compiler/luci/lang/src/Module.test.cpp +@@ -22,7 +22,7 @@ TEST(ModuleTest, consturctor) + { + auto gs = luci::make_module(); + +- GTEST_SUCCEED(); ++ SUCCEED(); + } + + TEST(ModuleTest, add) +diff --git a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp +index 74ea82c..c07268c 100644 +--- a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp ++++ b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp +@@ -35,7 +35,12 @@ TEST(CircleCustomTest, constructor) + ASSERT_EQ(0, custom_node.custom_code().size()); + } + +-TEST(CircleCustomTest, constructor_NEG) { ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); } ++TEST(CircleCustomTest, constructor_NEG) ++{ ++ ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); ++ ++ SUCCEED(); ++} + + TEST(CircleCustomTest, invalidIndex_NEG) + { +diff --git a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp +index e3c8c9f..35f28e9 100644 +--- a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp ++++ b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp +@@ -41,11 +41,15 @@ TEST(CircleIfTest, constructor) + TEST(CircleIfTestDeath, invalid_arity_NEG) + { + ASSERT_DEBUG_DEATH(luci::CircleIf very_long_name_if_node(0, 1), ""); ++ ++ SUCCEED(); + } + + TEST(CircleIfTestDeath, invalid_output_count_NEG) + { + ASSERT_DEBUG_DEATH(luci::CircleIf if_node(2, 0), ""); ++ ++ SUCCEED(); + } + + TEST(CircleIfTestDeath, invalid_input_get_index_NEG) +diff --git a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp +index 19290c0..913686f 100644 +--- a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp ++++ b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp +@@ -41,11 +41,15 @@ TEST(CircleWhileTest, constructor) + TEST(CircleWhileTestDeath, invalid_arity_NEG) + { + ASSERT_DEBUG_DEATH(luci::CircleWhile very_long_name_while_node(0, 1), ""); ++ ++ SUCCEED(); + } + + TEST(CircleWhileTestDeath, invalid_output_count_NEG) + { + ASSERT_DEBUG_DEATH(luci::CircleWhile while_node(2, 0), ""); ++ ++ SUCCEED(); + } + + TEST(CircleWhileTestDeath, invalid_input_get_index_NEG) +diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp +index 90fbe90..2edf7a9 100644 +--- a/compiler/luci/pass/src/CircleOptimizer.cpp ++++ b/compiler/luci/pass/src/CircleOptimizer.cpp +@@ -145,7 +145,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const + { + static const std::vector fakeq_supported_input_dtype{"float32"}; + static const std::vector fakeq_supported_output_dtype{"uint8"}; +- static const std::vector fakeq_supported_granularity{"layer"}; ++ static const std::vector fakeq_supported_granularity{"layer", "channel"}; + + auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); + auto output_dtype = 
_options->param(Options::AlgorithmParameters::Quantize_output_dtype); +@@ -173,7 +173,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const + { + static const std::vector qwmm_supported_input_dtype{"float32"}; + static const std::vector qwmm_supported_output_dtype{"uint8"}; +- static const std::vector qwmm_supported_granularity{"layer"}; ++ static const std::vector qwmm_supported_granularity{"layer", "channel"}; + + auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); + auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype); +diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp +index b81db88..edbaefa 100644 +--- a/compiler/luci/pass/src/FuseBCQPass.cpp ++++ b/compiler/luci/pass/src/FuseBCQPass.cpp +@@ -67,14 +67,190 @@ const std::string node_name_prefix(luci::NodeName node_name) + return prefix; + } + ++/** ++ * @brief Create CircleOutputExclude operation, which has same shape and dtype with ++ * original circle_node. ++ */ ++luci::CircleOutputExclude *createNoOp(luci::CircleNode *circle_node) ++{ ++ auto graph = circle_node->graph(); ++ auto noOp = graph->nodes()->create(); ++ ++ if (circle_node->shape_status() == luci::ShapeStatus::VALID) ++ { ++ noOp->dtype(circle_node->dtype()); ++ noOp->rank(circle_node->rank()); ++ for (uint32_t i = 0; i < circle_node->rank(); ++i) ++ noOp->dim(i) = circle_node->dim(i); ++ } ++ else ++ { ++ // For type inference ++ noOp->dtype(loco::DataType::FLOAT32); ++ } ++ ++ return noOp; ++}; ++ + } // namespace + + namespace + { + +-class BCQConverter final ++// V means the version of BCQ. ++template class BCQFuser; ++ ++template <> class BCQFuser<1> + { + public: ++ bool fuseBCQ(loco::Graph *g) ++ { ++ bool changed = false; ++ ++ for (auto node : loco::all_nodes(g)) ++ { ++ if (auto circle_const = dynamic_cast(node)) ++ { ++ add_BCQ_info_node(circle_const); ++ } ++ } ++ ++ if (!is_bcqinfo_valid()) ++ return false; ++ ++ for (auto node : loco::active_nodes(loco::output_nodes(g))) ++ { ++ if (auto gather = dynamic_cast(node)) ++ { ++ auto params = dynamic_cast(gather->params()); ++ if (params != nullptr && has_BCQ_info(params)) ++ { ++ auto bcq_gather = g->nodes()->create(); ++ ++ bcq_gather->op_version(1); ++ bcq_gather->input_scales(get_alpha(params)); ++ bcq_gather->input_binary(get_packed_binary_code(params)); ++ bcq_gather->indices(gather->indices()); ++ bcq_gather->input_clusters(packed_clusters(params)); ++ ++ // input_binary shape : [output_size, hidden_size] ++ const auto binary_hidden_size = ++ loco::must_cast(bcq_gather->input_binary())->dim(1).value() * 32; ++ bcq_gather->input_hidden_size(binary_hidden_size); ++ ++ if (do_w_x(params)) ++ { ++ bcq_gather->axis(gather->axis()); ++ } ++ else ++ { ++ const auto axis_transpose = (gather->axis() == 0) ? 
1 : 0; ++ bcq_gather->axis(axis_transpose); ++ } ++ ++ loco::replace(gather).with(bcq_gather); ++ ++ changed = true; ++ } ++ } ++ else if (auto fully_connected = dynamic_cast(node)) ++ { ++ auto weights = dynamic_cast(fully_connected->weights()); ++ if (weights != nullptr && has_BCQ_info(weights)) ++ { ++ auto bcq_fc = g->nodes()->create(); ++ ++ bcq_fc->op_version(1); ++ bcq_fc->weights_scales(get_alpha(weights)); ++ bcq_fc->weights_binary(get_packed_binary_code(weights)); ++ bcq_fc->bias(fully_connected->bias()); ++ bcq_fc->weights_clusters(packed_clusters(weights)); ++ bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); ++ ++ loco::Node *bcq_input = fully_connected->input(); ++ int32_t batch_rank = 0; ++ ++ // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2 ++ const auto original_input = loco::must_cast(fully_connected->input()); ++ if (original_input->shape_status() == luci::ShapeStatus::VALID && ++ original_input->rank() > 2) ++ { ++ auto new_shape = g->nodes()->create(); ++ new_shape->dtype(loco::DataType::S32); ++ new_shape->size(2); ++ new_shape->rank(1); ++ new_shape->dim(0) = 2; ++ ++ auto batch_size = 1; ++ for (uint32_t i = 0; i < original_input->rank() - 1; ++i) ++ batch_size *= original_input->dim(i).value(); ++ ++ new_shape->at(0) = batch_size; ++ new_shape->at(1) = ++ original_input->dim(original_input->rank() - 1).value(); ++ new_shape->shape_status(luci::ShapeStatus::VALID); ++ ++ auto reshape = g->nodes()->create(); ++ reshape->tensor(original_input); ++ reshape->shape(new_shape); ++ ++ bcq_input = reshape; ++ batch_rank = original_input->rank() - 2; ++ } ++ ++ // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected ++ if (do_w_x(weights)) ++ { ++ const auto binary_hidden_size = ++ loco::must_cast(fully_connected->input()) ++ ->dim(batch_rank) ++ .value(); ++ bcq_fc->weights_hidden_size(binary_hidden_size); ++ bcq_fc->input(bcq_input); ++ loco::replace(fully_connected).with(bcq_fc); ++ } ++ else ++ { ++ const auto binary_hidden_size = ++ loco::must_cast(fully_connected->input()) ++ ->dim(1 + batch_rank) ++ .value(); ++ bcq_fc->weights_hidden_size(binary_hidden_size); ++ ++ auto perm = g->nodes()->create(); ++ perm->dtype(loco::DataType::S32); ++ perm->size(2); ++ perm->rank(1); ++ perm->dim(0) = 2; ++ perm->at(0) = 1; ++ perm->at(1) = 0; ++ perm->shape_status(luci::ShapeStatus::VALID); ++ ++ auto input_transpose = g->nodes()->create(); ++ input_transpose->a(bcq_input); ++ input_transpose->perm(perm); ++ ++ bcq_fc->input(input_transpose); ++ ++ auto output_transpose = g->nodes()->create(); ++ output_transpose->a(bcq_fc); ++ output_transpose->perm(perm); ++ ++ loco::replace(fully_connected).with(output_transpose); ++ } ++ ++ changed = true; ++ } ++ } ++ } ++ ++ if (changed) ++ clear_BCQ_nodes(); ++ ++ return changed; ++ } ++ ++private: + void add_BCQ_info_node(luci::CircleConst *node) + { + const auto node_name = node->name(); +@@ -119,16 +295,65 @@ public: + return has_info; + } + ++ /** ++ * @brief Exclude BCQ information nodes which are used for fusing BCQ operations ++ * from graph output by using CircleOutputExclude ++ */ ++ void clear_BCQ_nodes() ++ { ++ auto clear_nodes = [](std::map &nodes) { ++ for (auto &n : nodes) ++ { ++ auto node = n.second; ++ ++ for (auto s : loco::succs(node)) ++ { ++ if (auto outnode = dynamic_cast(s)) ++ { ++ outnode->from(createNoOp(node)); ++ } ++ else if (auto reshape_node = dynamic_cast(s)) ++ { ++ for (auto o : loco::succs(reshape_node)) ++ { 
++ auto circle_output = loco::must_cast(o); ++ circle_output->from(createNoOp(reshape_node)); ++ } ++ } ++ } ++ } ++ }; ++ ++ clear_nodes(_do_w_x); ++ clear_nodes(_alpha); ++ clear_nodes(_packed_binary_code); ++ clear_nodes(_number_of_clusters); ++ clear_nodes(_size_of_clusters); ++ clear_nodes(_qbits_of_clusters); ++ clear_nodes(_dequant_weight); ++ } ++ ++ bool is_bcqinfo_valid() ++ { ++ // do_w_x should be int32 or bool type ++ for (auto n : _do_w_x) ++ { ++ if (n.second->dtype() != loco::DataType::BOOL && n.second->dtype() != loco::DataType::S32) ++ return false; ++ } ++ ++ return true; ++ } ++ ++private: + bool do_w_x(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + + if (_do_w_x[prefix]->dtype() == loco::DataType::S32) + return _do_w_x[prefix]->at(0) == 1; +- else if (_do_w_x[prefix]->dtype() == loco::DataType::BOOL) +- return _do_w_x[prefix]->at(0); + else +- throw std::runtime_error("do_w_x should be int or bool"); ++ return _do_w_x[prefix]->at(0); + } + + luci::CircleConst *get_alpha(luci::CircleConst *node) +@@ -187,64 +412,6 @@ public: + return packed_clusters; + } + +- /** +- * @brief Exclude BCQ information nodes which are used for fusing BCQ operations +- * from graph output by using CircleOutputExclude +- */ +- void clear_BCQ_nodes() +- { +- auto createNoOp = [](luci::CircleNode *circle_node) { +- auto graph = circle_node->graph(); +- auto noOp = graph->nodes()->create(); +- +- if (circle_node->shape_status() == luci::ShapeStatus::VALID) +- { +- noOp->dtype(circle_node->dtype()); +- noOp->rank(circle_node->rank()); +- for (uint32_t i = 0; i < circle_node->rank(); ++i) +- noOp->dim(i) = circle_node->dim(i); +- } +- else +- { +- // For type inference +- noOp->dtype(loco::DataType::FLOAT32); +- } +- +- return noOp; +- }; +- +- auto clear_nodes = [createNoOp](std::map &nodes) { +- for (auto &n : nodes) +- { +- auto node = n.second; +- +- for (auto s : loco::succs(node)) +- { +- if (auto outnode = dynamic_cast(s)) +- { +- outnode->from(createNoOp(node)); +- } +- else if (auto reshape_node = dynamic_cast(s)) +- { +- for (auto o : loco::succs(reshape_node)) +- { +- auto circle_output = loco::must_cast(o); +- circle_output->from(createNoOp(reshape_node)); +- } +- } +- } +- } +- }; +- +- clear_nodes(_do_w_x); +- clear_nodes(_alpha); +- clear_nodes(_packed_binary_code); +- clear_nodes(_number_of_clusters); +- clear_nodes(_size_of_clusters); +- clear_nodes(_qbits_of_clusters); +- clear_nodes(_dequant_weight); +- } +- + private: + std::map _do_w_x; + std::map _alpha; +@@ -262,142 +429,9 @@ namespace luci + + bool FuseBCQPass::run(loco::Graph *g) + { +- BCQConverter converter; +- + bool changed = false; + +- for (auto node : loco::all_nodes(g)) +- { +- if (auto circle_const = dynamic_cast(node)) +- { +- converter.add_BCQ_info_node(circle_const); +- } +- } +- +- for (auto node : loco::active_nodes(loco::output_nodes(g))) +- { +- if (auto gather = dynamic_cast(node)) +- { +- auto params = dynamic_cast(gather->params()); +- if (params != nullptr && converter.has_BCQ_info(params)) +- { +- auto bcq_gather = g->nodes()->create(); +- +- bcq_gather->input_scales(converter.get_alpha(params)); +- bcq_gather->input_binary(converter.get_packed_binary_code(params)); +- bcq_gather->indices(gather->indices()); +- bcq_gather->input_clusters(converter.packed_clusters(params)); +- +- const auto binary_hidden_size = +- loco::must_cast(bcq_gather->input_binary())->dim(1).value() * 32; +- bcq_gather->input_hidden_size(binary_hidden_size); +- +- if 
(converter.do_w_x(params)) +- { +- bcq_gather->axis(gather->axis()); +- } +- else +- { +- const auto axis_transpose = (gather->axis() == 0) ? 1 : 0; +- bcq_gather->axis(axis_transpose); +- } +- +- loco::replace(gather).with(bcq_gather); +- +- changed = true; +- } +- } +- else if (auto fully_connected = dynamic_cast(node)) +- { +- auto weights = dynamic_cast(fully_connected->weights()); +- if (weights != nullptr && converter.has_BCQ_info(weights)) +- { +- auto bcq_fc = g->nodes()->create(); +- +- bcq_fc->weights_scales(converter.get_alpha(weights)); +- bcq_fc->weights_binary(converter.get_packed_binary_code(weights)); +- bcq_fc->bias(fully_connected->bias()); +- bcq_fc->weights_clusters(converter.packed_clusters(weights)); +- bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); +- +- loco::Node *bcq_input = fully_connected->input(); +- int32_t batch_rank = 0; +- +- // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2 +- const auto original_input = loco::must_cast(fully_connected->input()); +- if (original_input->shape_status() == ShapeStatus::VALID && original_input->rank() > 2) +- { +- auto new_shape = g->nodes()->create(); +- new_shape->dtype(loco::DataType::S32); +- new_shape->size(2); +- new_shape->rank(1); +- new_shape->dim(0) = 2; +- +- auto batch_size = 1; +- for (uint32_t i = 0; i < original_input->rank() - 1; ++i) +- batch_size *= original_input->dim(i).value(); +- +- new_shape->at(0) = batch_size; +- new_shape->at(1) = +- original_input->dim(original_input->rank() - 1).value(); +- new_shape->shape_status(ShapeStatus::VALID); +- +- auto reshape = g->nodes()->create(); +- reshape->tensor(original_input); +- reshape->shape(new_shape); +- +- bcq_input = reshape; +- batch_rank = original_input->rank() - 2; +- } +- +- // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected +- if (converter.do_w_x(weights)) +- { +- const auto binary_hidden_size = +- loco::must_cast(fully_connected->input()) +- ->dim(batch_rank) +- .value(); +- bcq_fc->weights_hidden_size(binary_hidden_size); +- bcq_fc->input(bcq_input); +- loco::replace(fully_connected).with(bcq_fc); +- } +- else +- { +- const auto binary_hidden_size = +- loco::must_cast(fully_connected->input()) +- ->dim(1 + batch_rank) +- .value(); +- bcq_fc->weights_hidden_size(binary_hidden_size); +- +- auto perm = g->nodes()->create(); +- perm->dtype(loco::DataType::S32); +- perm->size(2); +- perm->rank(1); +- perm->dim(0) = 2; +- perm->at(0) = 1; +- perm->at(1) = 0; +- perm->shape_status(ShapeStatus::VALID); +- +- auto input_transpose = g->nodes()->create(); +- input_transpose->a(bcq_input); +- input_transpose->perm(perm); +- +- bcq_fc->input(input_transpose); +- +- auto output_transpose = g->nodes()->create(); +- output_transpose->a(bcq_fc); +- output_transpose->perm(perm); +- +- loco::replace(fully_connected).with(output_transpose); +- } +- +- changed = true; +- } +- } +- } +- +- if (changed) +- converter.clear_BCQ_nodes(); ++ changed = BCQFuser<1>().fuseBCQ(g); + + return changed; + } +diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp +index 6726ce7..9c9e741 100644 +--- a/compiler/luci/pass/src/QuantizationUtils.cpp ++++ b/compiler/luci/pass/src/QuantizationUtils.cpp +@@ -99,6 +99,13 @@ void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t + nudged_zero_point = static_cast(std::round(zero_point_double)); + } + ++ // protect scale from being very low due to overflow ++ if (scale < 
1e-5) ++ { ++ scale = 1e-5; ++ nudged_zero_point = static_cast(std::round(qmin_double - rmin / scale)); ++ } ++ + nudged_min = static_cast((qmin_double - nudged_zero_point) * scale); + nudged_max = static_cast((qmax_double - nudged_zero_point) * scale); + +diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp +index f8abee7..2264bd7 100644 +--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp ++++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp +@@ -138,7 +138,8 @@ bool is_quantized(const CircleNode *node) + node->dtype() == loco::DataType::S32; // bias + } + +-void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_factor) ++void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_factor, ++ int32_t &channel_dim_index) + { + assert(node->dtype() == loco::DataType::FLOAT32); + +@@ -153,7 +154,6 @@ void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_facto + uint32_t indices[4] = { + 0, + }; +- int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { +@@ -189,7 +189,7 @@ void sym_wquant_per_channel(CircleConst *node, std::vector &scaling_facto + } + + void asym_wquant_per_channel(CircleConst *node, std::vector &min, +- std::vector &scaling_factor) ++ std::vector &scaling_factor, int32_t &channel_dim_index) + { + assert(node->dtype() == loco::DataType::FLOAT32); + +@@ -204,7 +204,6 @@ void asym_wquant_per_channel(CircleConst *node, std::vector &min, + uint32_t indices[4] = { + 0, + }; +- int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { +@@ -350,8 +349,8 @@ struct QuantizeActivation final : public luci::CircleNodeMutableVisitor + circle_node->dtype(loco::DataType::S16); + } + +- circle_node->quantparam()->max[0] = nudged_max; +- circle_node->quantparam()->min[0] = nudged_min; ++ circle_node->quantparam()->min.clear(); ++ circle_node->quantparam()->max.clear(); + circle_node->quantparam()->scale.push_back(scaling_factor); + circle_node->quantparam()->zerop.push_back(zp); + } +@@ -472,15 +471,19 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor + assert(quantparam != nullptr); + auto min = quantparam->min; + auto scaling_factor = quantparam->scale; ++ int32_t channel_dim_index = 0; + + if (output_type == loco::DataType::U8) + { +- asym_wquant_per_channel(circle_const, min, scaling_factor); ++ asym_wquant_per_channel(circle_const, min, scaling_factor, channel_dim_index); + } + else + { +- sym_wquant_per_channel(circle_const, scaling_factor); ++ sym_wquant_per_channel(circle_const, scaling_factor, channel_dim_index); + } ++ quantparam->min.clear(); ++ quantparam->max.clear(); ++ quantparam->quantized_dimension = channel_dim_index; + } + // Find min/max per layer-wise + else +@@ -493,6 +496,8 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor + auto min = quantparam->min[0]; + auto scaling_factor = quantparam->scale[0]; + asym_wquant_per_layer(circle_const, min, scaling_factor); ++ quantparam->min.clear(); ++ quantparam->max.clear(); + } + } + } +diff --git a/compiler/luci/tests/test.lst b/compiler/luci/tests/test.lst +index 188e298..3da3437 100644 +--- a/compiler/luci/tests/test.lst ++++ b/compiler/luci/tests/test.lst +@@ -30,13 +30,16 @@ addread(Ceil_000) + addread(Concatenation_000) + addread(Concatenation_U8_000) + addread(Conv2D_000) ++addread(Conv2D_001) + addread(Conv2D_002) + addread(Conv2D_003) + addread(Conv2D_U8_000) ++addread(Conv2D_U8_001) + 
addread(Cos_000) + addread(DepthToSpace_000) + addread(DepthwiseConv2D_000) + addread(DepthwiseConv2D_U8_000) ++addread(DepthwiseConv2D_U8_001) + addread(DepthwiseConv2D_001) + addread(Div_000) + addread(ELU_000) +@@ -84,6 +87,7 @@ addread(MaxPool2D_000) + addread(MaxPool2D_U8_000) + addread(Mean_000) + addread(Mean_001) ++addread(Mean_U8_000) + addread(Minimum_000) + addread(MirrorPad_000) + addread(Mul_000) +@@ -97,6 +101,7 @@ addread(OneHot_003) + addread(Pack_000) + addread(Pack_U8_000) + addread(Pad_000) ++addread(Pad_U8_000) + addread(Pow_000) + addread(PRelu_000) + addread(Range_000) +@@ -222,13 +227,16 @@ addwrite(Ceil_000) + addwrite(Concatenation_000) + addwrite(Concatenation_U8_000) + addwrite(Conv2D_000) ++addwrite(Conv2D_001) + addwrite(Conv2D_002) + addwrite(Conv2D_003) + addwrite(Conv2D_U8_000) ++addwrite(Conv2D_U8_001) + addwrite(Cos_000) + addwrite(DepthToSpace_000) + addwrite(DepthwiseConv2D_000) + addwrite(DepthwiseConv2D_U8_000) ++addwrite(DepthwiseConv2D_U8_001) + addwrite(DepthwiseConv2D_001) + addwrite(Div_000) + addwrite(ELU_000) +@@ -276,6 +284,7 @@ addwrite(MaxPool2D_000) + addwrite(MaxPool2D_U8_000) + addwrite(Mean_000) + addwrite(Mean_001) ++addwrite(Mean_U8_000) + addwrite(Minimum_000) + addwrite(MirrorPad_000) + addwrite(Mul_000) +diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen +index 2c80664..820b6d8 100644 +--- a/compiler/one-cmds/one-codegen ++++ b/compiler/one-cmds/one-codegen +@@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + + function Usage() + { +- echo "Usage: $0 [BACKEND] ..." ++ echo "Usage: one-codegen [BACKEND] ..." + echo "Available BACKEND drivers:" + backend_exist=0 + for file in `find $DRIVER_PATH -name *-compile -type f`; +@@ -33,23 +33,34 @@ function Usage() + if [ $backend_exist == 0 ]; then + echo " (There is no available backend drivers)" + fi ++ ++ exit 255 + } + +-# Get command from command-line +-BACKEND=$1; shift +-BACKEND_DRIVER="$BACKEND-compile" ++function version() ++{ ++ $DRIVER_PATH/one-version one-codegen ++ exit 255 ++} + +-if [[ -z "${BACKEND_DRIVER}" ]]; then ++# Get command from command-line ++BACKEND=$1 ++if [[ -z ${BACKEND} ]]; then + Usage +- exit 255 + fi ++shift ++ ++if [[ "${BACKEND}" == "--version" ]]; then ++ version ++fi ++ ++BACKEND_DRIVER="${BACKEND}-compile" + + BACKEND_DRIVER_CMD="${DRIVER_PATH}/${BACKEND_DRIVER}" + + if [[ ! -f "${BACKEND_DRIVER_CMD}" ]]; then + echo "ERROR: '${BACKEND_DRIVER}' is not supported" + Usage +- exit 255 + fi + + "${BACKEND_DRIVER_CMD}" "$@" +diff --git a/compiler/one-cmds/one-import b/compiler/one-cmds/one-import +index dbf4af5..b1dd8f4 100644 +--- a/compiler/one-cmds/one-import ++++ b/compiler/one-cmds/one-import +@@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + + function Usage() + { +- echo "Usage: $0 [FRAMEWORK] ..." ++ echo "Usage: one-import [FRAMEWORK] ..." 
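
[Note on the compute_asym_scale_zp() hunk earlier in this patch: the new guard keeps a degenerate activation range from producing a near-zero scale, which the comment attributes to overflow of the quantized values. The following is a minimal stand-alone sketch of the guarded computation for the uint8 case only; the function name and signature are illustrative, not the exact luci helper, and the real code derives the zero point before clamping and then re-nudges min/max.]

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical helper: asymmetric uint8 scale/zero-point selection with the
// low-scale guard added by this patch.
void asym_scale_zp_sketch(float rmin, float rmax, float &scale, int64_t &zero_point)
{
  const double qmin = 0.0;
  const double qmax = 255.0;

  // The real range must contain 0.0 so that zero maps exactly onto the grid.
  rmin = std::min(rmin, 0.0f);
  rmax = std::max(rmax, 0.0f);

  scale = static_cast<float>((rmax - rmin) / (qmax - qmin));

  // Guard: clamp a near-zero scale and derive the zero point from the
  // clamped value, as the patch does with nudged_zero_point.
  if (scale < 1e-5f)
    scale = 1e-5f;

  const double zp = qmin - rmin / scale;
  zero_point = static_cast<int64_t>(std::round(std::min(std::max(zp, qmin), qmax)));
}
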
+ echo "Available FRAMEWORK drivers:" + framework_exist=0 + for file in "$DRIVER_PATH"/one-import-*; +@@ -31,23 +31,34 @@ function Usage() + if [ $framework_exist == 0 ]; then + echo " (There is no available import drivers)" + fi ++ ++ exit 255 + } + +-# Get command from command-line +-FRAMEWORK=$1; shift +-FRAMEWORK_DRIVER="one-import-$FRAMEWORK" ++function version() ++{ ++ $DRIVER_PATH/one-version one-import-tf ++ exit 255 ++} + +-if [[ -z "${FRAMEWORK_DRIVER}" ]]; then ++# Get command from command-line ++FRAMEWORK=$1 ++if [[ -z ${FRAMEWORK} ]]; then + Usage +- exit 255 ++fi ++shift ++ ++if [ ${FRAMEWORK} = "--version" ]; then ++ version + fi + ++FRAMEWORK_DRIVER="one-import-$FRAMEWORK" ++ + FRAMEWORK_DRIVER_CMD="${DRIVER_PATH}/${FRAMEWORK_DRIVER}" + + if [[ ! -f "${FRAMEWORK_DRIVER_CMD}" ]]; then + echo "ERROR: '${FRAMEWORK_DRIVER}' is not supported" + Usage +- exit 255 + fi + + "${FRAMEWORK_DRIVER_CMD}" "$@" +diff --git a/compiler/one-cmds/one-import-tf b/compiler/one-cmds/one-import-tf +index c048a4e..d59e1c5 100644 +--- a/compiler/one-cmds/one-import-tf ++++ b/compiler/one-cmds/one-import-tf +@@ -22,14 +22,24 @@ usage() + { + echo "Convert TensorFlow model to circle." + echo "Usage: one-import-tf" ++ echo " --version Show version information and exit" + echo " --input_path " + echo " --output_path " + echo " --input_arrays " + echo " --input_shapes " + echo " --output_arrays " +- exit 0 ++ echo " --v2 Use TensorFlow 2.x interface (default is 1.x interface)" ++ exit 255 + } + ++version() ++{ ++ $DRIVER_PATH/one-version one-import-tf ++ exit 255 ++} ++ ++TF_INTERFACE="--v1" ++ + # Parse command-line arguments + # + while [ "$#" -ne 0 ]; do +@@ -39,6 +49,9 @@ while [ "$#" -ne 0 ]; do + '--help') + usage + ;; ++ '--version') ++ version ++ ;; + '--input_path') + export INPUT_PATH="$2" + shift 2 +@@ -59,6 +72,10 @@ while [ "$#" -ne 0 ]; do + export OUTPUT_ARRAYS="$2" + shift 2 + ;; ++ '--v2') ++ TF_INTERFACE="--v2" ++ shift ++ ;; + *) + echo "Unknown parameter: ${CUR}" + shift +@@ -92,14 +109,21 @@ fi + # remove previous log + rm -rf "${OUTPUT_PATH}.log" + ++show_err_onexit() ++{ ++ cat "${OUTPUT_PATH}.log" ++} ++ ++trap show_err_onexit ERR ++ + # generate temporary tflite file +-echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \ ++echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \ + --input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \ + --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ + --output_arrays ${OUTPUT_ARRAYS} > "${OUTPUT_PATH}.log" + echo " " >> "${OUTPUT_PATH}.log" + +-python "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \ ++python "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \ + --input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \ + --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ + --output_arrays ${OUTPUT_ARRAYS} >> "${OUTPUT_PATH}.log" 2>&1 +diff --git a/compiler/one-cmds/one-import-tflite b/compiler/one-cmds/one-import-tflite +index 31ed5af..053489c 100644 +--- a/compiler/one-cmds/one-import-tflite ++++ b/compiler/one-cmds/one-import-tflite +@@ -22,9 +22,16 @@ usage() + { + echo "Convert TensorFlow lite model to circle." 
+ echo "Usage: one-import-tflite" ++ echo " --version Show version information and exit" + echo " --input_path " + echo " --output_path " +- exit 0 ++ exit 255 ++} ++ ++version() ++{ ++ $DRIVER_PATH/one-version one-import-tflite ++ exit 255 + } + + # Parse command-line arguments +@@ -36,6 +43,9 @@ while [ "$#" -ne 0 ]; do + '--help') + usage + ;; ++ '--version') ++ version ++ ;; + '--input_path') + export INPUT_PATH="$2" + shift 2 +@@ -55,12 +65,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then + echo "Error: input model not found" + echo "" + usage +- exit 2 + fi + + # remove previous log + rm -rf "${OUTPUT_PATH}.log" + ++show_err_onexit() ++{ ++ cat "${OUTPUT_PATH}.log" ++} ++ ++trap show_err_onexit ERR ++ + # convert .tflite to .circle + echo "${DRIVER_PATH}/tflite2circle" "${INPUT_PATH}" "${OUTPUT_PATH}" > "${OUTPUT_PATH}.log" + +diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize +index 95384c1..17b6b98 100644 +--- a/compiler/one-cmds/one-optimize ++++ b/compiler/one-cmds/one-optimize +@@ -22,6 +22,7 @@ usage() + { + echo "Optimize circle model." + echo "Usage: one-optimize" ++ echo " --version Show version information and exit" + echo " --all Enable all optimization algorithms" + echo " --fuse_bcq Enable FuseBCQ Pass" + echo " --fuse_instnorm Enable FuseInstanceNormalization Pass" +@@ -33,7 +34,13 @@ usage() + echo " Enable ResolveCustomOpMatMulPass Pass" + echo " --input_path " + echo " --output_path " +- exit 0 ++ exit 255 ++} ++ ++version() ++{ ++ $DRIVER_PATH/one-version one-optimize ++ exit 255 + } + + OPTIMIZE_all=0 +@@ -52,6 +59,9 @@ while [ "$#" -ne 0 ]; do + '--help') + usage + ;; ++ '--version') ++ version ++ ;; + '--all') + OPTIMIZE_all=1 + shift +@@ -96,7 +106,6 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then + echo "Error: input model not found" + echo "" + usage +- exit 2 + fi + + OPTIMIZE_OPTIONS="" +@@ -123,6 +132,13 @@ fi + # remove previous log + rm -rf "${OUTPUT_PATH}.log" + ++show_err_onexit() ++{ ++ cat "${OUTPUT_PATH}.log" ++} ++ ++trap show_err_onexit ERR ++ + # NOTE do not wrap ${OPTIMIZE_OPTIONS} with "" + # optimize circle + echo "${DRIVER_PATH}/circle2circle" ${OPTIMIZE_OPTIONS} \ +diff --git a/compiler/one-cmds/one-pack b/compiler/one-cmds/one-pack +index 2bc4c60..9224b2c 100644 +--- a/compiler/one-cmds/one-pack ++++ b/compiler/one-cmds/one-pack +@@ -22,9 +22,16 @@ usage() + { + echo "Package circle to nnpkg" + echo "Usage: one-pack" ++ echo " -v, --version Show version information and exit" + echo " -i " + echo " -o " +- exit 0 ++ exit 255 ++} ++ ++version() ++{ ++ $DRIVER_PATH/one-version one-pack ++ exit 255 + } + + # Parse command-line arguments +@@ -36,6 +43,12 @@ while [ "$#" -ne 0 ]; do + '--help') + usage + ;; ++ '-v') ++ version ++ ;; ++ '--version') ++ version ++ ;; + '-i') + export INPUT_PATH="$2" + shift 2 +@@ -55,12 +68,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then + echo "Error: input model not found" + echo "" + usage +- exit 2 + fi + + # remove previous log + rm -rf "${OUTPUT_PATH}.log" + ++show_err_onexit() ++{ ++ cat "${OUTPUT_PATH}.log" ++} ++ ++trap show_err_onexit ERR ++ + # Package circle model file to nnpkg + echo "${DRIVER_PATH}/model2nnpkg.sh" -o "${OUTPUT_PATH}" "${INPUT_PATH}" > "${OUTPUT_PATH}.log" + +diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize +index ff9e266..c74b2c2 100644 +--- a/compiler/one-cmds/one-quantize ++++ b/compiler/one-cmds/one-quantize +@@ -22,16 +22,23 @@ usage() + { + echo "Quantize circle model." 
+ echo "Usage: one-quantize" ++ echo " --version Show version information and exit" + echo " --input_dtype Input data type (supported: float32, default=float32)" + echo " --quantized_dtype Output quantized data type (supported: uint8, default=uint8)" +- echo " --granularity Quantize granularity (supported: layer, default=layer)" ++ echo " --granularity Quantize granularity (supported: layer, channel, default=layer)" + echo " --min_percentile Minimum percentile (0.0~100.0, default=1.0)" + echo " --max_percentile Maximum percentile (0.0~100.0, default=99.0)" + echo " --mode Record mode (supported: percentile/moving_average, default=percentile)" + echo " --input_path " + echo " --input_data " + echo " --output_path " +- exit 0 ++ exit 255 ++} ++ ++version() ++{ ++ $DRIVER_PATH/one-version one-quantize ++ exit 255 + } + + INPUT_DTYPE=float32 +@@ -50,6 +57,9 @@ while [ "$#" -ne 0 ]; do + '--help') + usage + ;; ++ '--version') ++ version ++ ;; + + '--input_dtype') + INPUT_DTYPE="$2" +@@ -100,13 +110,11 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then + echo "Error: input model not found" + echo "" + usage +- exit 2 + fi + if [ -z ${INPUT_DATA} ] || [ ! -e ${INPUT_DATA} ]; then + echo "Error: input data not found" + echo "" + usage +- exit 2 + fi + + FILE_BASE=$(basename ${OUTPUT_PATH}) +@@ -118,6 +126,13 @@ trap "{ rm -rf $TMPDIR; }" EXIT + # remove previous log + rm -rf "${OUTPUT_PATH}.log" + ++show_err_onexit() ++{ ++ cat "${OUTPUT_PATH}.log" ++} ++ ++trap show_err_onexit ERR ++ + # quantize circle + echo "${DRIVER_PATH}/circle-quantizer" \ + --quantize_dequantize_weights ${INPUT_DTYPE} ${QUANTIZED_DTYPE} ${GRANULARITY} \ +diff --git a/compiler/one-cmds/requires.cmake b/compiler/one-cmds/requires.cmake +index 9b858ad..812149c 100644 +--- a/compiler/one-cmds/requires.cmake ++++ b/compiler/one-cmds/requires.cmake +@@ -3,3 +3,4 @@ require("tflite2circle") + require("circle2circle") + require("circle-quantizer") + require("record-minmax") ++require("vconone") +diff --git a/compiler/record-minmax/CMakeLists.txt b/compiler/record-minmax/CMakeLists.txt +index 862660e..f8a165b 100644 +--- a/compiler/record-minmax/CMakeLists.txt ++++ b/compiler/record-minmax/CMakeLists.txt +@@ -19,9 +19,14 @@ target_link_libraries(record-minmax safemain) + target_link_libraries(record-minmax luci_import) + target_link_libraries(record-minmax luci_export) + target_link_libraries(record-minmax luci_interpreter) ++target_link_libraries(record-minmax vconone) + + install(TARGETS record-minmax DESTINATION bin) + ++if(NOT ENABLE_TEST) ++ return() ++endif(NOT ENABLE_TEST) ++ + nnas_find_package(GTest REQUIRED) + GTest_AddTest(record_minmax_function_test "${CMAKE_CURRENT_SOURCE_DIR}/tests/RecordFunction.test.cpp") + target_include_directories(record_minmax_function_test PRIVATE include) +diff --git a/compiler/record-minmax/driver/Driver.cpp b/compiler/record-minmax/driver/Driver.cpp +index ae4fcb7..8b09498 100644 +--- a/compiler/record-minmax/driver/Driver.cpp ++++ b/compiler/record-minmax/driver/Driver.cpp +@@ -17,6 +17,13 @@ + #include "RecordMinMax.h" + + #include ++#include ++ ++void print_version(void) ++{ ++ std::cout << "record-minmax version " << vconone::get_string() << std::endl; ++ std::cout << vconone::get_copyright() << std::endl; ++} + + int entry(const int argc, char **argv) + { +@@ -25,6 +32,13 @@ int entry(const int argc, char **argv) + arser::Arser arser( + "Embedding min/max values of activations to the circle model for post-training quantization"); + ++ arser.add_argument("--version") ++ .nargs(0) 
++ .required(false) ++ .default_value(false) ++ .help("Show version information and exit") ++ .exit_with(print_version); ++ + arser.add_argument("--input_model") + .nargs(1) + .type(arser::DataType::STR) +@@ -66,7 +80,7 @@ int entry(const int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + auto input_model_path = arser.get("--input_model"); +diff --git a/compiler/record-minmax/requires.cmake b/compiler/record-minmax/requires.cmake +index 0545035..f6804ce 100644 +--- a/compiler/record-minmax/requires.cmake ++++ b/compiler/record-minmax/requires.cmake +@@ -1,3 +1,4 @@ + require("luci") + require("safemain") + require("arser") ++require("vconone") +diff --git a/compiler/record-minmax/src/HDF5Importer.cpp b/compiler/record-minmax/src/HDF5Importer.cpp +index cf30cd8..a0e65ee 100644 +--- a/compiler/record-minmax/src/HDF5Importer.cpp ++++ b/compiler/record-minmax/src/HDF5Importer.cpp +@@ -20,6 +20,7 @@ + + #include + #include ++#include + + using Shape = luci_interpreter::Shape; + using DataType = luci_interpreter::DataType; +diff --git a/compiler/record-minmax/src/MinMaxObserver.cpp b/compiler/record-minmax/src/MinMaxObserver.cpp +index 45f0197..410ce3d 100644 +--- a/compiler/record-minmax/src/MinMaxObserver.cpp ++++ b/compiler/record-minmax/src/MinMaxObserver.cpp +@@ -38,7 +38,8 @@ void MinMaxObserver::postTensorWrite(const luci::CircleNode *node, + assert(node->opcode() != luci::CircleOpcode::UNPACK); + assert(node->opcode() != luci::CircleOpcode::WHILE); + +- if (node->opcode() == luci::CircleOpcode::CONST) ++ if (node->opcode() == luci::CircleOpcode::CONST || ++ node->opcode() == luci::CircleOpcode::CIRCLECONST) + { + // node is not activation. Do nothing. + return; +diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp +index d12a0d3..17c6aa6 100644 +--- a/compiler/record-minmax/src/RecordMinMax.cpp ++++ b/compiler/record-minmax/src/RecordMinMax.cpp +@@ -158,7 +158,7 @@ void RecordMinMax::profileData(const std::string &mode, const std::string &input + auto node = iter->first; + auto minmax = iter->second; + +- float min, max; ++ float min{0.0f}, max{0.0f}; + if (mode == "percentile") + { + min = getNthPercentile(minmax.min_vector, min_percentile); +diff --git a/compiler/record-minmax/tests/RecordFunction.test.cpp b/compiler/record-minmax/tests/RecordFunction.test.cpp +index 13b464d..e2f135a 100644 +--- a/compiler/record-minmax/tests/RecordFunction.test.cpp ++++ b/compiler/record-minmax/tests/RecordFunction.test.cpp +@@ -32,6 +32,8 @@ TEST(GetNthPercentileTest, Edge) + + EXPECT_FLOAT_NEAR(0, getNthPercentile(input, 0)); + EXPECT_FLOAT_NEAR(9, getNthPercentile(input, 100)); ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, Simple) +@@ -47,6 +49,8 @@ TEST(GetNthPercentileTest, Simple) + { + EXPECT_FLOAT_NEAR(0.09 * std::floor(i) + 0.045, getNthPercentile(input, i)); + } ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, Float) +@@ -61,6 +65,8 @@ TEST(GetNthPercentileTest, Float) + EXPECT_FLOAT_NEAR(2.799942346802177, getNthPercentile(input, 1)); + EXPECT_FLOAT_NEAR(7.768503955476342, getNthPercentile(input, 3.14)); + EXPECT_FLOAT_NEAR(99.40456084968194, getNthPercentile(input, 99)); ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, FloatWithNegative) +@@ -75,6 +81,8 @@ TEST(GetNthPercentileTest, FloatWithNegative) + EXPECT_FLOAT_NEAR(-47.20005765319782, getNthPercentile(input, 1)); + EXPECT_FLOAT_NEAR(-42.23149604452366, getNthPercentile(input, 3.14)); + 
EXPECT_FLOAT_NEAR(49.40456084968194, getNthPercentile(input, 99)); ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, SigleElement) +@@ -84,6 +92,8 @@ TEST(GetNthPercentileTest, SigleElement) + EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 0)); + EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 50)); + EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 100)); ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, OutOfBoundary_NEG) +@@ -92,6 +102,8 @@ TEST(GetNthPercentileTest, OutOfBoundary_NEG) + + EXPECT_THROW(getNthPercentile(input, -1), std::runtime_error); + EXPECT_THROW(getNthPercentile(input, 101), std::runtime_error); ++ ++ SUCCEED(); + } + + TEST(GetNthPercentileTest, EmptyVector_NEG) +@@ -99,6 +111,8 @@ TEST(GetNthPercentileTest, EmptyVector_NEG) + std::vector input; + + EXPECT_THROW(getNthPercentile(input, 10), std::runtime_error); ++ ++ SUCCEED(); + } + + } // namespace record_minmax +diff --git a/compiler/tfl-verify/CMakeLists.txt b/compiler/tfl-verify/CMakeLists.txt +index d33059f..4421a46 100644 +--- a/compiler/tfl-verify/CMakeLists.txt ++++ b/compiler/tfl-verify/CMakeLists.txt +@@ -6,6 +6,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp") + + add_executable(tfl-verify ${SOURCES}) + target_include_directories(tfl-verify PRIVATE src) ++target_link_libraries(tfl-verify arser) + target_link_libraries(tfl-verify foder) + target_link_libraries(tfl-verify mio_tflite) + target_link_libraries(tfl-verify safemain) +diff --git a/compiler/tfl-verify/requires.cmake b/compiler/tfl-verify/requires.cmake +index ed6b84d..79503f3 100644 +--- a/compiler/tfl-verify/requires.cmake ++++ b/compiler/tfl-verify/requires.cmake +@@ -1,3 +1,4 @@ ++require("arser") + require("foder") + require("mio-tflite") + require("safemain") +diff --git a/compiler/tfl-verify/src/Driver.cpp b/compiler/tfl-verify/src/Driver.cpp +index 81f6d54..6d18976 100644 +--- a/compiler/tfl-verify/src/Driver.cpp ++++ b/compiler/tfl-verify/src/Driver.cpp +@@ -16,22 +16,31 @@ + + #include "VerifyFlatBuffers.h" + ++#include ++ + #include + #include + #include + + int entry(int argc, char **argv) + { +- if (argc != 2) ++ arser::Arser arser; ++ arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file path to verify"); ++ ++ try + { +- std::cerr << "ERROR: Failed to parse arguments" << std::endl; +- std::cerr << std::endl; +- std::cerr << "USAGE: " << argv[0] << " [tflite]" << std::endl; ++ arser.parse(argc, argv); ++ } ++ catch (const std::runtime_error &err) ++ { ++ std::cout << err.what() << std::endl; ++ std::cout << arser; + return 255; + } ++ + auto verifier = std::make_unique(); + +- std::string model_file = argv[argc - 1]; ++ std::string model_file = arser.get("tflite"); + + std::cout << "[ RUN ] Check " << model_file << std::endl; + +diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp +index 932a649..692ce48 100644 +--- a/compiler/tflchef/core/src/ModelChef.cpp ++++ b/compiler/tflchef/core/src/ModelChef.cpp +@@ -413,6 +413,7 @@ template void cook_graph(const T &graph, CookParams &cp) + quant_builder.add_min(quant_min); + quant_builder.add_scale(quant_scale); + quant_builder.add_zero_point(quant_zero_point); ++ quant_builder.add_quantized_dimension(quant.quantized_dimension()); + + // Update QuantizationParameters Index + quant_index = quant_builder.Finish(); +diff --git a/compiler/tflchef/proto/tflchef.proto b/compiler/tflchef/proto/tflchef.proto +index 792503b..55785c3 100644 +--- a/compiler/tflchef/proto/tflchef.proto ++++ b/compiler/tflchef/proto/tflchef.proto +@@ -35,6 +35,7 @@ 
message TensorQuantization { + repeated float max = 2; + repeated float scale = 3; + repeated int64 zero_point = 4; ++ optional int32 quantized_dimension = 5 [default = 0]; + } + + message Operand { +diff --git a/compiler/tflchef/tflite/src/RecipeChef.cpp b/compiler/tflchef/tflite/src/RecipeChef.cpp +index db62d0e..088961c 100644 +--- a/compiler/tflchef/tflite/src/RecipeChef.cpp ++++ b/compiler/tflchef/tflite/src/RecipeChef.cpp +@@ -184,6 +184,8 @@ std::unique_ptr generate_recipe(const tflite::Model *model) + for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx) + chef_quant->add_zero_point(quant->zero_point()->Get(idx)); + } ++ tflchef::TensorQuantization *chef_quant = operand->mutable_quant(); ++ chef_quant->set_quantized_dimension(quant->quantized_dimension()); + } + } + +diff --git a/compiler/tflchef/tools/file/Driver.cpp b/compiler/tflchef/tools/file/Driver.cpp +index cecfeeb..46e5b55 100644 +--- a/compiler/tflchef/tools/file/Driver.cpp ++++ b/compiler/tflchef/tools/file/Driver.cpp +@@ -41,7 +41,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + int32_t model_version = 1; +diff --git a/compiler/tflchef/tools/reverse/Driver.cpp b/compiler/tflchef/tools/reverse/Driver.cpp +index 1116dec..4d795a3 100644 +--- a/compiler/tflchef/tools/reverse/Driver.cpp ++++ b/compiler/tflchef/tools/reverse/Driver.cpp +@@ -38,7 +38,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + std::string tflite_path = arser.get("tflite"); +diff --git a/compiler/tfldump/driver/Driver.cpp b/compiler/tfldump/driver/Driver.cpp +index 3961d2f..38c9c06 100644 +--- a/compiler/tfldump/driver/Driver.cpp ++++ b/compiler/tfldump/driver/Driver.cpp +@@ -33,7 +33,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << '\n'; + std::cout << arser; +- return 0; ++ return 255; + } + + std::string tflite_path = arser.get("tflite"); +diff --git a/compiler/tflite2circle/CMakeLists.txt b/compiler/tflite2circle/CMakeLists.txt +index a0a2e02..b1d1f61 100644 +--- a/compiler/tflite2circle/CMakeLists.txt ++++ b/compiler/tflite2circle/CMakeLists.txt +@@ -14,5 +14,6 @@ target_link_libraries(tflite2circle arser) + target_link_libraries(tflite2circle safemain) + target_link_libraries(tflite2circle mio_tflite) + target_link_libraries(tflite2circle mio_circle) ++target_link_libraries(tflite2circle vconone) + + install(TARGETS tflite2circle DESTINATION bin) +diff --git a/compiler/tflite2circle/driver/Driver.cpp b/compiler/tflite2circle/driver/Driver.cpp +index 67b8e33..2f11e0a 100644 +--- a/compiler/tflite2circle/driver/Driver.cpp ++++ b/compiler/tflite2circle/driver/Driver.cpp +@@ -24,10 +24,25 @@ + #include "CircleModel.h" + #include "TFLModel.h" + ++#include ++ ++void print_version(void) ++{ ++ std::cout << "tflite2circle version " << vconone::get_string() << std::endl; ++ std::cout << vconone::get_copyright() << std::endl; ++} ++ + int entry(int argc, char **argv) + { + arser::Arser arser{"tflite2circle is a Tensorflow lite to circle model converter"}; + ++ arser.add_argument("--version") ++ .nargs(0) ++ .required(false) ++ .default_value(false) ++ .help("Show version information and exit") ++ .exit_with(print_version); ++ + arser.add_argument("tflite") + .nargs(1) + .type(arser::DataType::STR) +@@ -42,7 +57,7 @@ int entry(int argc, char **argv) + { + std::cout << err.what() << std::endl; + std::cout << arser; +- return 0; ++ return 255; + } + + 
std::string tfl_path = arser.get("tflite"); +diff --git a/compiler/tflite2circle/requires.cmake b/compiler/tflite2circle/requires.cmake +index ff19b74..837c287 100644 +--- a/compiler/tflite2circle/requires.cmake ++++ b/compiler/tflite2circle/requires.cmake +@@ -2,3 +2,4 @@ require("arser") + require("mio-tflite") + require("mio-circle") + require("safemain") ++require("vconone") +diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt +new file mode 100644 +index 0000000..b8cb793 +--- /dev/null ++++ b/compiler/vconone/CMakeLists.txt +@@ -0,0 +1,31 @@ ++if (NOT VCONONE_VERSION) ++ set(VCONONE_VERSION 0x0000000000080001) ++ # NOTE order is [build patch minor major] ++ # if VCONONE_VERSION is set with -D option, it will be cached ++ # you may have to remove cache file if you remove -D option ++endif() ++ ++configure_file(version_cfg.h.in version_cfg.h @ONLY) ++ ++set(DRIVER "driver/driver.cpp") ++ ++file(GLOB_RECURSE SOURCES "src/*.cpp") ++file(GLOB_RECURSE TESTS "src/*.test.cpp") ++list(REMOVE_ITEM SOURCES ${TESTS}) ++ ++add_library(vconone STATIC ${SOURCES}) ++target_include_directories(vconone PUBLIC include) ++target_include_directories(vconone PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) ++ ++add_executable(one-version ${DRIVER}) ++target_link_libraries(one-version vconone) ++install(TARGETS one-version DESTINATION bin) ++ ++if(NOT ENABLE_TEST) ++ return() ++endif(NOT ENABLE_TEST) ++ ++nnas_find_package(GTest REQUIRED) ++ ++GTest_AddTest(vconone_test ${TESTS}) ++target_link_libraries(vconone_test vconone) +diff --git a/compiler/vconone/README.md b/compiler/vconone/README.md +new file mode 100644 +index 0000000..c08dd63 +--- /dev/null ++++ b/compiler/vconone/README.md +@@ -0,0 +1,14 @@ ++# vconone ++ ++_vconone_ provides version number and strings for one-* commands and command ++line tools ++ ++# Revise version number ++ ++To revise version number, update `VCONONE_VERSION` in `CmakeLists.txt` ++or give `-DVCONONE_VERSION=0x0000000100080001` at cmake configure step. ++ ++Number given is four numbers `build`, `patch`, `minor` and `major` in order for ++each 16bit integers. `build` is not used for now. ++ ++`0x0000000100080001` version is interpretered as `1.8.1` +diff --git a/compiler/vconone/driver/driver.cpp b/compiler/vconone/driver/driver.cpp +new file mode 100644 +index 0000000..12bd0ee +--- /dev/null ++++ b/compiler/vconone/driver/driver.cpp +@@ -0,0 +1,36 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
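
[Note on the quantization plumbing earlier in this patch: QuantizeWithMinMaxPass now records channel_dim_index into quantparam->quantized_dimension, and tflchef.proto / ModelChef / RecipeChef carry a matching quantized_dimension field. That field names the tensor axis along which the per-channel scale and zero_point arrays are indexed. The sketch below is illustrative only and is not part of the patch; the helper name is hypothetical.]

#include <cstdint>
#include <vector>

// Dequantization rule implied by quantized_dimension: scale[c] and
// zero_point[c] apply to every element whose index along the quantized
// dimension equals c.
float dequantize_per_channel(int8_t q, uint32_t channel_index, const std::vector<float> &scale,
                             const std::vector<int64_t> &zero_point)
{
  return scale[channel_index] * static_cast<float>(q - zero_point[channel_index]);
}
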
++ */ ++ ++#include ++ ++#include ++#include ++ ++int main(int argc, char *argv[]) ++{ ++ auto str = vconone::get_string(); ++ if (argc >= 2) ++ { ++ for (int c = 1; c < argc; ++c) ++ std::cout << argv[c] << " "; ++ std::cout << "version " << str << std::endl; ++ std::cout << vconone::get_copyright() << std::endl; ++ } ++ else ++ std::cout << str; ++ ++ return 0; ++} +diff --git a/compiler/vconone/include/vconone/vconone.h b/compiler/vconone/include/vconone/vconone.h +new file mode 100644 +index 0000000..a6a1998 +--- /dev/null ++++ b/compiler/vconone/include/vconone/vconone.h +@@ -0,0 +1,61 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __VCON_ONE_H__ ++#define __VCON_ONE_H__ ++ ++#include ++#include ++ ++namespace vconone ++{ ++ ++struct four ++{ ++ uint16_t major; ++ uint16_t minor; ++ uint16_t patch; ++ uint16_t build; // build is not used for now ++}; ++ ++union version { ++ uint64_t v; ++ four f; ++}; ++ ++/** ++ * @brief get_number will return version union structure ++ */ ++version get_number(void); ++ ++/** ++ * @brief get_string will return string of major.minor.patch (without build) ++ */ ++std::string get_string(void); ++ ++/** ++ * @brief get_string4 will return string of major.minor.patch.build ++ */ ++std::string get_string4(void); ++ ++/** ++ * @brief get_copyright will return copyright string ++ */ ++std::string get_copyright(void); ++ ++} // namespace vconone ++ ++#endif // __VCON_ONE_H__ +diff --git a/compiler/vconone/src/version.cpp b/compiler/vconone/src/version.cpp +new file mode 100644 +index 0000000..9b693c6 +--- /dev/null ++++ b/compiler/vconone/src/version.cpp +@@ -0,0 +1,63 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "vconone/vconone.h" ++ ++#include "version_cfg.h" ++ ++#include ++ ++namespace vconone ++{ ++ ++version get_number(void) ++{ ++ version v; ++ v.v = VCONONE_VERSION; ++ return v; ++} ++ ++std::string get_string4(void) ++{ ++ std::ostringstream ss; ++ ++ auto v = get_number(); ++ ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch) << "." ++ << unsigned(v.f.build); ++ ++ return ss.str(); ++} ++ ++std::string get_string(void) ++{ ++ std::ostringstream ss; ++ ++ auto v = get_number(); ++ ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." 
<< unsigned(v.f.patch); ++ ++ return ss.str(); ++} ++ ++std::string get_copyright(void) ++{ ++ std::string str; ++ str = "Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved\r\n"; ++ str += "Licensed under the Apache License, Version 2.0\r\n"; ++ str += "https://github.com/Samsung/ONE"; ++ return str; ++} ++ ++} // namespace vconone +diff --git a/compiler/vconone/src/version.test.cpp b/compiler/vconone/src/version.test.cpp +new file mode 100644 +index 0000000..35a0647 +--- /dev/null ++++ b/compiler/vconone/src/version.test.cpp +@@ -0,0 +1,49 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include ++ ++#include ++ ++TEST(vconone, version_number) ++{ ++ auto v = vconone::get_number(); ++ ++ ASSERT_NE(0x0000000000000000ULL, v.v); ++} ++ ++TEST(vconone, version_string) ++{ ++ auto str = vconone::get_string(); ++ ++ ASSERT_NE("..", str); ++ ASSERT_NE("", str); ++} ++ ++TEST(vconone, version_string4) ++{ ++ auto str = vconone::get_string4(); ++ ++ ASSERT_NE("...", str); ++ ASSERT_NE("", str); ++} ++ ++TEST(vconone, copyright) ++{ ++ auto str = vconone::get_copyright(); ++ ++ ASSERT_NE("", str); ++} +diff --git a/compiler/vconone/version_cfg.h.in b/compiler/vconone/version_cfg.h.in +new file mode 100644 +index 0000000..aa3ad9e +--- /dev/null ++++ b/compiler/vconone/version_cfg.h.in +@@ -0,0 +1,22 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __VCON_ONE_VERSION_CFG_H__ ++#define __VCON_ONE_VERSION_CFG_H__ ++ ++#define VCONONE_VERSION @VCONONE_VERSION@ULL ++ ++#endif // __VCON_ONE_VERSION_CFG_H__ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h +deleted file mode 100644 +index 9699b5c..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h ++++ /dev/null +@@ -1,124 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
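
[Usage note on the vconone sources above: VCONONE_VERSION packs four 16-bit fields, and with the little-endian layout the version union relies on, the default 0x0000000000080001 reads as major=1, minor=8, patch=0, build=0. The sketch below decodes the same value without the union; it assumes the field order documented in the README and version_cfg.h.in, and is not part of the patch.]

#include <cstdint>
#include <iostream>

int main()
{
  const uint64_t v = 0x0000000000080001ULL; // default VCONONE_VERSION

  const uint16_t major = static_cast<uint16_t>(v & 0xFFFF);
  const uint16_t minor = static_cast<uint16_t>((v >> 16) & 0xFFFF);
  const uint16_t patch = static_cast<uint16_t>((v >> 32) & 0xFFFF);
  const uint16_t build = static_cast<uint16_t>((v >> 48) & 0xFFFF);

  // Prints "1.8.0" and "1.8.0.0", matching vconone::get_string() and
  // vconone::get_string4() for this default value.
  std::cout << major << "." << minor << "." << patch << std::endl;
  std::cout << major << "." << minor << "." << patch << "." << build << std::endl;
  return 0;
}
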
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-/** +- * @file CLArgOperationKernel.h +- * @brief This file defines CLArgOperationKernel +- * @ingroup COM_AI_RUNTIME +- */ +- +-#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ +-#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to define interface for the argop kernel. +- */ +-class CLArgOperationKernel : public ICLKernel +-{ +-public: +- /** +- * @brief Default constructor. +- */ +- CLArgOperationKernel(); +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers). +- * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied +- */ +- CLArgOperationKernel(const CLArgOperationKernel &) = delete; +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers). +- * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied +- * @return Reference of this instance +- */ +- CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete; +- /** +- * @brief Allow instances of this class to be moved +- * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved +- */ +- CLArgOperationKernel(CLArgOperationKernel &&) = default; +- /** +- * @brief Allow instances of this class to be moved +- * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved +- * @return Reference of this instance +- */ +- CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default; +- /** +- * @brief Initialise the kernel's input, output and border mode. +- * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. +- * @param[out] output The output tensor, Data types supported: S32. +- * @param[in] axis Axis along which to reduce. 
It must be sorted and no duplicates. +- * @param[in] op Arg operation to perform. +- * return N/A +- */ +- void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op); +- /** +- * @brief Static function to check if given info will lead to a valid configuration of @ref +- * CLArgOperationKernel +- * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. +- * @param[in] output The output tensor info, Data types supported: S32. +- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. +- * @param[in] op Arg operation to perform. +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, +- ArgOperation op); +- +- /* +- * @brief Run CLArgOperationKernel op +- * @param[in] window Window to be used for in_slice +- * @param[in] queue cl::CommandQueue +- * @return N/A +- */ +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input; +- ICLTensor *_output; +- uint32_t _axis; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h +deleted file mode 100644 +index b0357fe..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h ++++ /dev/null +@@ -1,121 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-/** +- * @file CLCastKernel.h +- * @ingroup COM_AI_RUNTIME +- * @brief This file defines CLCastKernel class +- */ +- +-#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ +-#define __ARM_COMPUTE_CLCASTKERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to define OpenCL kernel for cast operation +- */ +-class CLCastKernel : public ICLKernel +-{ +-public: +- /** +- * @brief Construct CLCastKernel object +- */ +- CLCastKernel(); +- +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers) +- */ +- CLCastKernel(const CLCastKernel &) = delete; +- +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers) +- */ +- CLCastKernel &operator=(const CLCastKernel &) = delete; +- +- /** +- * @brief Construct CLCastKernel object using default move constructor +- * @param[in] CLCastKernel object to move +- */ +- CLCastKernel(CLCastKernel &&) = default; +- +- /** +- * @brief Allow instances of this class to be moved +- * @param[in] CLCastKernel object to move +- */ +- CLCastKernel &operator=(CLCastKernel &&) = default; +- +- /** +- * @brief Destruct this CLCastKernel object +- */ +- ~CLCastKernel() = default; +- +- /** +- * @brief Initialise the kernel's input and output. +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] input_subtype Sub data type of input. +- * @return N/A +- */ +- void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype); +- +- /** +- * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command +- * queue. +- * @note The queue is *not* flushed by this method, and therefore the kernel will not have +- * been executed by the time this method returns. +- * @param[in] window Region on which to execute the kernel. (Must be a valid region of +- * the window returned by window()). +- * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A +- * @return N/A +- */ +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input; /**< Source tensor */ +- ICLTensor *_output; /**< Destination tensor */ +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h +deleted file mode 100644 +index 8615cf1..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h ++++ /dev/null +@@ -1,82 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ +-#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** OpenCL kernel to perform depthTospace operation */ +-class CLDepthToSpaceKernel : public ICLKernel +-{ +-public: +- /** Default constructor */ +- CLDepthToSpaceKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete; +- /** Allow instances of this class to be moved */ +- CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default; +- /** Allow instances of this class to be moved */ +- CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default; +- /** Default destructor */ +- ~CLDepthToSpaceKernel() = default; +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- */ +- void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); +- +- // Inherited methods overridden: +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input; /**< Source tensor */ +- ICLTensor *_output; /**< Destination tensor */ +-}; +- +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h +deleted file mode 100644 +index 9321c36..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h ++++ /dev/null +@@ -1,117 +0,0 @@ +-/* +- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__ +-#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** OpenCL kernel to multiply matrices +- * +- * @note This kernel should be used ONLY for Midgard architectures +- * +- * This kernel performs the following computation: +- * +- * -# Convert a values from int8 to int32 +- * -# Convert b values from int8 to int32 +- * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 +- * +- */ +-class CLGEMMLowpMatrixMultiplyKernelEx : public ICLKernel +-{ +-public: +- /** Default Constructor */ +- CLGEMMLowpMatrixMultiplyKernelEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLGEMMLowpMatrixMultiplyKernelEx(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLGEMMLowpMatrixMultiplyKernelEx &operator=(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete; +- /** Allow instances of this class to be moved */ +- CLGEMMLowpMatrixMultiplyKernelEx(CLGEMMLowpMatrixMultiplyKernelEx &&) = default; +- /** Allow instances of this class to be moved */ +- CLGEMMLowpMatrixMultiplyKernelEx &operator=(CLGEMMLowpMatrixMultiplyKernelEx &&) = default; +- /** Initialise the kernel's input and output. +- * +- * @note This kernel should be used ONLY for Midgard architectures +- * +- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 +- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p +- * input0 +- * @param[out] output Output tensor to store the result of matrix multiplication. 
Data type +- * supported: S32 +- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of +- * the input matrices +- */ +- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, +- const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * CLGEMMLowpMatrixMultiplyKernelEx +- * +- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8 +- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p +- * input0 +- * @param[in] output Output tensor to store the result of matrix multiplication. Data type +- * supported: S32 +- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of +- * the input matrices +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, +- const ITensorInfo *output, +- const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo()); +- +- // Inherited methods overridden: +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input0; +- const ICLTensor *_input1; +- ICLTensor *_output; +- bool _slide_matrix_b; +- bool _reinterpret_input_as_3d; +- bool _reinterpret_output_as_3d; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__*/ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h +deleted file mode 100644 +index dd2dbf6..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h ++++ /dev/null +@@ -1,83 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__ +-#define __ARM_COMPUTE_CLPRELU_KERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** OpenCL kernel to calculate PReLU*/ +-class CLPReLUKernel : public ICLKernel +-{ +-public: +- /** Default constructor */ +- CLPReLUKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers). */ +- CLPReLUKernel(const CLPReLUKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers). */ +- CLPReLUKernel &operator=(const CLPReLUKernel &) = delete; +- /** Allow instances of this class to be moved */ +- CLPReLUKernel(CLPReLUKernel &&) = default; +- /** Allow instances of this class to be moved */ +- CLPReLUKernel &operator=(CLPReLUKernel &&) = default; +- /** Initialize the kernel's input, output. +- * +- * @param[in] input Source tensor1. +- * @param[in] alpha Source tensor2. +- * @param[out] output Output tensor. +- */ +- void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output); +- +- // Inherited methods overridden: +- void run(const Window &window, cl::CommandQueue &queue) override; +- +- BorderSize border_size() const override; +- +-private: +- const ICLTensor *_input; +- const ICLTensor *_alpha; +- ICLTensor *_output; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h +deleted file mode 100644 +index 4c0a82c..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h ++++ /dev/null +@@ -1,82 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. 
+- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ +-#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** OpenCL kernel to perform spaceTodepth operation */ +-class CLSpaceToDepthKernel : public ICLKernel +-{ +-public: +- /** Default constructor */ +- CLSpaceToDepthKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete; +- /** Allow instances of this class to be moved */ +- CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default; +- /** Allow instances of this class to be moved */ +- CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default; +- /** Default destructor */ +- ~CLSpaceToDepthKernel() = default; +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- */ +- void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); +- +- // Inherited methods overridden: +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input; /**< Source tensor */ +- ICLTensor *_output; /**< Destination tensor */ +-}; +- +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h +deleted file mode 100644 +index 9d174de..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h ++++ /dev/null +@@ -1,109 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ +-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ +- +-#include "arm_compute/core/CL/ICLKernel.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL. +- */ +-class CLTransposeConvLayerUpsampleKernel : public ICLKernel +-{ +-public: +- /** Constructor */ +- CLTransposeConvLayerUpsampleKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayerUpsampleKernel & +- operator=(const CLTransposeConvLayerUpsampleKernel &) = delete; +- /** Default Move Constructor. */ +- CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default; +- /** Default move assignment operator */ +- CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default; +- /** Default destructor */ +- ~CLTransposeConvLayerUpsampleKernel() = default; +- +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. +- * @param[out] output Destination tensor. Data types supported: same as @p input. All but +- * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only +- * performed within the XY-plane. +- * @param[in] inner_border Top and right inner border sizes. These rows and columns will be +- * filled with zero. +- * @param[in] info Contains padding and stride information described in @ref +- * PadStrideInfo. +- */ +- void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, +- const PadStrideInfo &info); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * CLTransposeConvLayerUpsample +- * +- * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. +- * @param[in] output Destination tensor info. Data types supported: same as @p input. All +- * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is +- * only performed within the XY-plane. 
+- * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled +- * with zero. +- * @param[in] info Contains padding and stride information described in @ref +- * PadStrideInfo. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, +- const BorderSize &inner_border, const PadStrideInfo &info); +- +- // Inherited methods overridden: +- void run(const Window &window, cl::CommandQueue &queue) override; +- +-private: +- const ICLTensor *_input; +- ICLTensor *_output; +- BorderSize _inner_border; +- PadStrideInfo _info; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h +deleted file mode 100644 +index d4c9c61..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h ++++ /dev/null +@@ -1,88 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ +-#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ +- +-#include "arm_compute/core/CPP/ICPPKernel.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** CPP kernel to perform tensor upsample. 
+- * +- */ +-class CPPUpsampleKernelEx : public ICPPKernel +-{ +-public: +- const char *name() const override { return "CPPUpsampleKernelEx"; } +- /** Default constructor */ +- CPPUpsampleKernelEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete; +- /** Allow instances of this class to be moved */ +- CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default; +- /** Allow instances of this class to be moved */ +- CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default; +- /** Default destructor */ +- ~CPPUpsampleKernelEx() = default; +- +- /** Set the input and output of the kernel. +- * +- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 +- * @param[out] output The output tensor. Data types supported: Same as @p input +- * @param[in] info Padding info. +- */ +- void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- bool is_parallelisable() const override; +- +-private: +- const ITensor *_input; +- ITensor *_output; +- PadStrideInfo _info; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h +deleted file mode 100644 +index 4e9f097..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h ++++ /dev/null +@@ -1,96 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NECASTKERNEL_H__ +-#define __ARM_COMPUTE_NECASTKERNEL_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Interface for the cast layer kernel. */ +-class NECastKernel : public INEKernel +-{ +-public: +- const char *name() const override { return "NECastKernel"; } +- /** Default constructor */ +- NECastKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NECastKernel(const NECastKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NECastKernel &operator=(const NECastKernel &) = delete; +- /** Default Move Constructor. */ +- NECastKernel(NECastKernel &&) = default; +- /** Default move assignment operator */ +- NECastKernel &operator=(NECastKernel &&) = default; +- /** Default destructor */ +- ~NECastKernel() = default; +- /** Set input, output tensors. +- * +- * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32. +- * @param[out] output Destination tensor with the same dimensions of input. Data type supported: +- * U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] input_subtype Sub data type of input. +- */ +- void configure(const ITensor *input, ITensor *output, SubDataType input_subtype); +- /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel +- * +- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] input_subtype Sub data type of input. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, +- SubDataType input_subtype); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- +-private: +- const ITensor *_input; +- ITensor *_output; +- SubDataType _input_subtype; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h +deleted file mode 100644 +index b62897e..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h ++++ /dev/null +@@ -1,96 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ +-#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Interface for the depth to space kernel */ +-class NEDepthToSpaceLayerKernelEx : public INEKernel +-{ +-public: +- const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; } +- /** Default constructor */ +- NEDepthToSpaceLayerKernelEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete; +- /** Allow instances of this class to be moved */ +- NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default; +- /** Allow instances of this class to be moved */ +- NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default; +- /** Default destructor */ +- ~NEDepthToSpaceLayerKernelEx() = default; +- /** Initialise the kernel's inputs and output. +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[out] output Tensor output. Data types supported: same as @p input +- * @param[in] block_shape Block shape x value. +- */ +- void configure(const ITensor *input, ITensor *output, int32_t block_shape); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEDepthToSpaceLayerKernelEx. +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * @param[in] block_shape Block shape value. 
+- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- +-private: +- const ITensor *_input; /**< Source tensor */ +- ITensor *_output; /**< Destination tensor */ +- int32_t _block_shape; /**< Block shape */ +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h +deleted file mode 100644 +index 57de78d..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h ++++ /dev/null +@@ -1,118 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ +-#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Interface for an element-wise unary operation kernel +- * +- * Element-wise operation is computed by: +- * @f[ output(x) = OP(input(x))@f] +- * +- */ +-class NEElementwiseUnaryKernelEx : public INEKernel +-{ +-public: +- const char *name() const override { return "NEElementwiseUnaryKernelEx"; } +- /** Default constructor */ +- NEElementwiseUnaryKernelEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete; +- /** Allow instances of this class to be moved */ +- NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default; +- /** Allow instances of this class to be moved */ +- NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default; +- /** Default destructor */ +- ~NEElementwiseUnaryKernelEx() = default; +- +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEElementwiseUnaryKernelEx +- * +- * @param[in] op Arithmetic operation to be executed. +- * @param[in] input First tensor input. Data types supported: F16/F32/S32. +- * @param[in] output Output tensor. Data types supported: Same as @p input. +- */ +- void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output); +- +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEElementwiseUnaryKernelEx +- * +- * @param[in] op Arithmetic operation to be executed. +- * @param[in] input First tensor input info. Data types supported: F16/F32/S32. +- * @param[in] output Output tensor info. Data types supported: Same as @p input. +- * +- * @return a Status +- */ +- static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input, +- const ITensorInfo *output); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- +- /** Common signature for all the specialised arithmetic functions +- * +- * @param[in] input An input tensor. Data types supported: F16/F32/S32. +- * @param[out] output The output tensor. Data types supported: Same as @p input. +- * @param[in] window Region on which to execute the kernel. +- */ +- using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output, +- const Window &window); +- +-protected: +- // Inherited methods overridden: +- static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output); +- +- /** Function to use for the particular tensor types passed to configure() */ +- std::function _function; +- +- const ITensor *_input; +- ITensor *_output; +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h +deleted file mode 100644 +index 722efd3..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h ++++ /dev/null +@@ -1,100 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__ +-#define __ARM_COMPUTE_NEPRELUKERNEL_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Interface for the kernel to perform Parametric Rectified Linear Unit +- * +- * Result is computed by: +- * @f[ output(x) = alpha * x for x < 0, output(x) = x for x >= 0 @f] +- */ +-class NEPReLUKernel : public INEKernel +-{ +-public: +- const char *name() const override { return "NEPReLUKernel"; } +- /** Default constructor */ +- NEPReLUKernel(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEPReLUKernel(const NEPReLUKernel &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEPReLUKernel &operator=(const NEPReLUKernel &) = delete; +- /** Allow instances of this class to be moved */ +- NEPReLUKernel(NEPReLUKernel &&) = default; +- /** Allow instances of this class to be moved */ +- NEPReLUKernel &operator=(NEPReLUKernel &&) = default; +- /** Initialise the kernel's inputs and output +- * +- * @param[in] input Input tensor. Data type supported: QASYMM8/F32 +- * @param[in] alpha Alpha tensor. Data types supported: Same as @p input +- * @param[out] output Output tensor. Data types supported: Same as @p input +- */ +- void configure(const ITensor *input, const ITensor *alpha, ITensor *output); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEPReLUKernel.h +- * +- * @param[in] input Input tensor input info. Data types supported: QASYMM8/F32. 
+- * @param[in] alpha Alpha tensor input info. Data types supported: Same as @p input. +- * @param[in] output Output tensor info. Data types supported: Same as @p input. +- * +- * @return a Status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, +- const ITensorInfo *output); +- static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, +- const ITensorInfo &output); +- +-private: +- const ITensor *_input; /**< Source tensor */ +- const ITensor *_alpha; /**< Alpha tensor */ +- ITensor *_output; /**< Destination tensor */ +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h +deleted file mode 100644 +index 0ffcf6b..0000000 +--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h ++++ /dev/null +@@ -1,97 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ +-#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +-#include "arm_compute/core/Types.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Interface for the space to depth kernel */ +-class NESpaceToDepthLayerKernelEx : public INEKernel +-{ +-public: +- const char *name() const override { return "NESpaceToDepthLayerKernelEx"; } +- /** Default constructor */ +- NESpaceToDepthLayerKernelEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete; +- /** Allow instances of this class to be moved */ +- NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default; +- /** Allow instances of this class to be moved */ +- NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default; +- /** Default destructor */ +- ~NESpaceToDepthLayerKernelEx() = default; +- /** Initialise the kernel's inputs and output. +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[out] output Tensor output. Data types supported: same as @p input +- * @param[in] block_shape Block shape value +- */ +- void configure(const ITensor *input, ITensor *output, int32_t block_shape); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NESpaceToDepthLayerKernelEx +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * @param[in] block_shape Block shape value +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +- +- // Inherited methods overridden: +- void run(const Window &window, const ThreadInfo &info) override; +- +-private: +- const ITensor *_input; /**< Source tensor */ +- ITensor *_output; /**< Destination tensor */ +- int32_t _block_shape; /**< Block shape */ +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h +index 97bc4ce..cfbd134 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h +@@ -16,25 +16,14 @@ + #ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__ + #define __ARM_COMPUTE_CLFUNCTIONSEX_H__ + +-#include +-#include + #include +-#include +-#include + #include + #include + #include + #include + #include +-#include + #include +-#include +-#include + #include +-#include +-#include +-#include +-#include + #include + #include + +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h +deleted file mode 100644 +index c37096f..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h ++++ /dev/null +@@ -1,129 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-/** +- * @file CLArgOperation.h +- * @ingroup COM_AI_RUNTIME +- * @brief This file contains arm_compute::CLArgOperation class +- */ +- +-#ifndef __ARM_COMPUTE_CLARGOPERATION_H__ +-#define __ARM_COMPUTE_CLARGOPERATION_H__ +- +-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" +-#include "arm_compute/runtime/CL/CLTensor.h" +-#include "arm_compute/runtime/IFunction.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to execute CLArgOperation operation +- */ +-class CLArgOperation : public IFunction +-{ +-public: +- /** +- * @brief Construct a new CLArgOperation object +- */ +- CLArgOperation(); +- +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers) +- */ +- CLArgOperation(const CLArgOperation &) = delete; +- +- /** +- * @brief Prevent instances of this class from being copied (As this class contains pointers) +- */ +- CLArgOperation &operator=(const CLArgOperation &) = delete; +- +- /** +- * @brief Construct a new CLArgOperation object by using copy constructor +- * @param[in] CLArgOperation object to move +- */ +- CLArgOperation(CLArgOperation &&) = default; +- +- /** +- * @brief Assign a CLArgOperation object. +- * @param[in] CLArgOperation object to assign. This object will be moved. +- */ +- CLArgOperation &operator=(CLArgOperation &&) = default; +- +- /** +- * @brief Initialise the kernel's inputs and outputs. +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. +- * @param[out] output The result of arg operation. Data types supported: S32. +- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. 
+- * @param[in] op Arg operation to perform. +- * @return N/A +- */ +- void configure(ICLTensor *input, ICLTensor *output, std::vector axis, ArgOperation op); +- +- /** +- * @brief Static function to check if given info will lead to a valid configuration +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32. +- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. +- * @param[out] output The result of arg operation. Data types supported: S32. +- * @param[in] op Arg operation to perform. +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const std::vector &axis, +- const ITensorInfo *output, ArgOperation op); +- /** +- * @brief Run the OpenCL kernel for this operation +- * @return N/A +- */ +- void run() override; +- +-private: +- ICLTensor *_input{nullptr}; +- ICLTensor *_output{nullptr}; +- std::vector _axis{}; +- ArgOperation _arg_op{ArgOperation::MAX}; +- +- std::unique_ptr _interm_tensors{nullptr}; +- std::unique_ptr _argop_kernels{nullptr}; +- size_t _num_of_kernels{0}; +-}; +-} +-#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h +deleted file mode 100644 +index eed5cb8..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h ++++ /dev/null +@@ -1,69 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ +-#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Basic function to run @ref CLBatchToSpaceNDKernel +- * +- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. +- * @note The function converts the input tensor to the tensor of the output tensor's type. +- */ +-class CLBatchToSpaceND : public ICLSimpleFunction +-{ +-public: +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] block_size A pointer to an array of integer values specifying block sizes +- * for spatial dimension. +- */ +- void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size); +-}; +- +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h +deleted file mode 100644 +index ebe0d8a..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h ++++ /dev/null +@@ -1,75 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-/** +- * @file CLCast.h +- * @ingroup COM_AI_RUNTIME +- * @brief This file contains arm_compute::CLCast class +- */ +- +-#ifndef __ARM_COMPUTE_CLCAST_H__ +-#define __ARM_COMPUTE_CLCAST_H__ +- +-#include "arm_compute/core/TypesEx.h" +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to run @ref CLCastKernel. +- * This converts the input tensor to the tensor of the output tensor's type. +- */ +-class CLCast : public ICLSimpleFunction +-{ +-public: +- /** +- * @brief Initialise the kernel's input and output +- * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * The input tensor is [in, out] because its TensorInfo might be +- * modified inside the kernel. +- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[in] input_subtype Sub data type of input. +- */ +- void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype); +-}; +-} +-#endif /* __ARM_COMPUTE_CLCAST_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h +deleted file mode 100644 +index d52a538..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h ++++ /dev/null +@@ -1,68 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__ +-#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Basic function to run @ref CLDepthToSpaceKernel +- * +- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. +- * @note The function converts the input tensor to the tensor of the output tensor's type. +- */ +-class CLDepthToSpace : public ICLSimpleFunction +-{ +-public: +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[block_size] block size integer only +- */ +- void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); +-}; +-} // namesace arm_compute +- +-#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h +new file mode 100644 +index 0000000..409eaf5 +--- /dev/null ++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h +@@ -0,0 +1,201 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++/* ++ * Copyright (c) 2019-2020 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ ++#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ ++ ++#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" ++#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h" ++#include "arm_compute/runtime/CL/functions/CLReverse.h" ++#include "arm_compute/runtime/CL/functions/CLTranspose.h" ++ ++#include "arm_compute/runtime/CL/CLTensor.h" ++#include "arm_compute/runtime/IFunction.h" ++#include "arm_compute/runtime/IMemoryManager.h" ++#include "arm_compute/runtime/MemoryGroup.h" ++ ++#include <memory> ++ ++namespace arm_compute ++{ ++class ICLTensor; ++/** Function to run the deconvolution layer. ++ * ++ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input ++ * depending on the stride and pad info and then perform a 1x1 ++ * convolution pass. Input stride defines how many zeroes we should put between each element of the ++ * input and pad is the amount of padding. ++ * ++ * The relation between input to output is as follows: ++ * \f[ ++ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x ++ * \f] ++ * \f[ ++ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y ++ * \f] ++ * ++ * where: ++ * width_input is the size of the first input dimension. ++ * height_input is the size of the second input dimension. ++ * width_output is the size of the first output dimension. ++ * height_output is the size of the second output dimension. ++ * kernel_x and kernel_y are the convolution sizes in x and y. ++ * stride_x and stride_y is the input stride of the first and second dimension. ++ * ++ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. ++ * Therefore, it will be necessary to use the weights in the ++ * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse. ++ * ++ * This function calls the following OpenCL kernels/functions: ++ * ++ * -# @ref CLDeconvolutionLayerUpsample ++ * -# @ref CLConvolutionLayer ++ * ++ * And the following CPP kernels: ++ * -# @ref CLReverse ++ * ++ */ ++class CLDirectTransposeConvLayer : public IFunction ++{ ++public: ++ /** Constructor */ ++ CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete; ++ /** Default move constructor */ ++ CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default; ++ /** Prevent instances of this class from being copied (As this class contains pointers) */ ++ CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete; ++ /** Default move assignment operator */ ++ CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default; ++ /** Set the input, weights, biases and output tensors. ++ * ++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an ++ * optional 4th dimension for batch of inputs. ++ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type ++ * supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension.
++ * Data type supported: Should match @p input data type, except for ++ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type ++ * @param[out] output Output tensor. The output has the same number of dimensions as the ++ * @p input. ++ * @param[in] info Contains padding and policies to be used in the deconvolution, this ++ * is described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output. ++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. ++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, ++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. ++ * ++ */ ++ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, ++ const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, ++ const WeightsInfo &weights_info = WeightsInfo()); ++ /** Set the input, weights, biases and output tensors. ++ * ++ * @param[in] compile_context The compile context to be used. ++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and ++ * an optional 4th dimension for batch of inputs. ++ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data ++ * type supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension. ++ * Data type supported: Should match @p input data type, except for ++ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type ++ * @param[out] output Output tensor. The output has the same number of dimensions as ++ * the @p input. ++ * @param[in] info Contains padding and policies to be used in the deconvolution, ++ * this is described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output. ++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. ++ * @param[in] weights_info (Optional) Weights information needed for @ref ++ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref ++ * CLWeightsReshapeKernel. ++ * ++ */ ++ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, ++ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, ++ unsigned int invalid_right, unsigned int invalid_bottom, ++ const WeightsInfo &weights_info = WeightsInfo()); ++ /** Static function to check if given info will lead to a valid configuration of @ref ++ * CLDirectTransposeConvLayer ++ * ++ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an ++ * optional 4th dimension for batch of inputs. ++ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data ++ * type supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension. ++ * Data type supported: Should match @p input data type, except for input ++ * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type ++ * @param[in] output Output tensor info. The output has the same number of dimensions as the ++ * @p input. ++ * @param[in] info Contains padding and policies to be used in the deconvolution, this is ++ * described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. ++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, ++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. ++ * ++ * @return a status ++ */ ++ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, ++ const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, ++ unsigned int invalid_right, unsigned int invalid_bottom, ++ const WeightsInfo &weights_info = WeightsInfo()); ++ ++ // Inherited methods overridden: ++ void run() override; ++ void prepare() override; ++ ++private: ++ MemoryGroup _memory_group; ++ CLDeconvolutionLayerUpsample _scale_f; ++ CLConvolutionLayer _conv_f; ++ CLReverse _flip_weights; ++ ++ CLTensor _scaled_output; ++ ICLTensor *_original_weights; ++ CLTensor _weights_flipped; ++ CLTensor _flip_axis; ++ ++ bool _is_prepared; ++}; ++} // namespace arm_compute ++#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +index 1a0284a..f3266f6 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +@@ -50,7 +50,7 @@ + #include "arm_compute/core/CL/kernels/CLTransposeKernel.h" + #include "arm_compute/runtime/MemoryGroup.h" + #include "arm_compute/runtime/CL/CLTensor.h" +-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h" ++#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" + + namespace arm_compute + { +@@ -168,7 +168,7 @@ private: + CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel; + CLScaleFactorSymm8Kernel _scale_factor_kernel; + CLQuantizationSymmetricKernel _quant_input_kernel; +- CLGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; ++ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + CLMultiplyScaleFactorKernel _multiply_scale_kernel; + CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to + // add bias in +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h +deleted file mode 100644 +index 68aba74..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h ++++ /dev/null +@@ -1,142 +0,0 @@ +-/* +- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. 
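The output-size relation quoted in the CLDirectTransposeConvLayer documentation above is easy to sanity-check outside the library. Below is a minimal standalone C++ sketch of just that formula; the helper name and the example numbers are illustrative and not part of the patch.

#include <cstdio>

// width_output = (width_input - 1) * stride_x - 2 * padding_x + kernel_x
// (the same relation holds for the height/y dimension).
static int transpose_conv_out_dim(int in_dim, int kernel, int stride, int padding)
{
  return (in_dim - 1) * stride - 2 * padding + kernel;
}

int main()
{
  // Example: a 14x14 input with a 3x3 kernel, stride 2 and padding 1
  // gives a 27x27 output: (14 - 1) * 2 - 2 * 1 + 3 = 27.
  std::printf("%d\n", transpose_conv_out_dim(14, 3, 2, 1));
  return 0;
}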
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +-#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +- +-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h" +-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" +-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +-#include "arm_compute/runtime/CL/CLTensor.h" +-#include "arm_compute/runtime/IFunction.h" +-#include "arm_compute/runtime/MemoryGroup.h" +- +-namespace arm_compute +-{ +-class IMemoryManager; +-class ICLTensor; +- +-/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the +- * following OpenCL kernels: +- * +- * -# @ref CLGEMMLowpMatrixMultiplyKernel (if the parameter "reshape_b_only_on_first_run" of +- * GEMMInfo is FALSE) +- * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0) +- * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0) +- * +-*/ +-class CLGEMMLowpMatrixMultiplyCoreEx : public IFunction +-{ +-public: +- /** Constructor */ +- CLGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr memory_manager = nullptr); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLGEMMLowpMatrixMultiplyCoreEx(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete; +- /** Default move constructor */ +- CLGEMMLowpMatrixMultiplyCoreEx(CLGEMMLowpMatrixMultiplyCoreEx &&) = default; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLGEMMLowpMatrixMultiplyCoreEx &operator=(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete; +- /** Default move assignment operator */ +- CLGEMMLowpMatrixMultiplyCoreEx &operator=(CLGEMMLowpMatrixMultiplyCoreEx &&) = default; +- /** Initialise the kernel's inputs, output +- * +- * @note GEMMLowp: low precision GEMM kernel. [A * B + C] +- * This kernel performs the following computations: +- * +- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. +- * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them. +- * -# Compute the matrix product of the resulting a * b in int32. 
+- * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE +- * +- * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. +- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a +- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: +- * S32 +- * @param[out] output Output tensor. Data type supported: S32 or QASYMM8 if +- * gemm_info.gemmlowp_output_stage != NONE +- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped +- * and +- * if the reshape of matrix B should be executed only for the first run +- */ +- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, +- const GEMMInfo &gemm_info = GEMMInfo()); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * CLGEMMLowpMatrixMultiplyCoreEx +- * +- * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8. +- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a +- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type +- * supported: S32 +- * @param[in] output Output tensor info. Data type supported: S32 or QASYMM8 if +- * gemm_info.gemmlowp_output_stage != NONE +- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped +- * and +- * if the reshape of matrix B should be executed only for the first run +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, +- const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); +- +- // Inherited methods overridden: +- void run() override; +- void prepare() override; +- +-private: +- MemoryGroup _memory_group; +- +- // Kernels used +- CLGEMMLowpMatrixMultiplyKernelEx _mm_midgard_kernel; +- CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; +- CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; +- +- // Temporary tensors +- CLTensor _vector_sum_col; +- CLTensor _vector_sum_row; +- +- int32_t _a_offset; +- int32_t _b_offset; +- bool _reshape_b_only_on_first_run; +- bool _is_prepared; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h +deleted file mode 100644 +index 5121671..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h ++++ /dev/null +@@ -1,62 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. 
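The comment in the removed CLGEMMLowpMatrixMultiplyCoreEx header above describes the low-precision GEMM as: add a_offset to the QASYMM8 values of A, add b_offset to the values of B, and accumulate the products in int32. The following is a plain reference sketch of that accumulation only, not the OpenCL implementation; the function name and the row-major layout are assumptions made for illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

// Offsets are added to the raw uint8 values and the products are accumulated in int32.
static std::vector<int32_t> gemmlowp_ref(const std::vector<uint8_t> &a, // M x K, row-major
                                         const std::vector<uint8_t> &b, // K x N, row-major
                                         int m, int k, int n, int32_t a_offset, int32_t b_offset)
{
  std::vector<int32_t> out(m * n, 0);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p)
        out[i * n + j] += (static_cast<int32_t>(a[i * k + p]) + a_offset) *
                          (static_cast<int32_t>(b[p * n + j]) + b_offset);
  return out;
}

int main()
{
  // 1x2 times 2x1 with both offsets at -128: (130-128)*(129-128) + (126-128)*(127-128) = 4
  std::printf("%d\n", gemmlowp_ref({130, 126}, {129, 127}, 1, 2, 1, -128, -128)[0]);
  return 0;
}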
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__ +-#define __ARM_COMPUTE_CLLOGICALNOT_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-class CLLogicalNot : public ICLSimpleFunction +-{ +-public: +- /** Initialise the function's source and destination. +- * +- * @param[in] input Source tensor. Data types supported: QASYMM8. +- * @param[out] output Output tensor. Data types supported: QASYMM8. +- */ +- void configure(ICLTensor *input, ICLTensor *output); +-}; +- +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h +deleted file mode 100644 +index 7fbe558..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h ++++ /dev/null +@@ -1,64 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. 
+- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLPRELU_H__ +-#define __ARM_COMPUTE_CLPRELU_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-class CLPReLU : public ICLSimpleFunction +-{ +-public: +- /** Initialise the function's source and destination. +- * +- * @param[in] input. Data types supported: +- * QASYMM8/F16/F32. +- * @param[in] alpha. Data types supported: +- * QASYMM8/F16/F32. +- * @param[out] output Output tensor. Data types supported: Same as @p input. +- */ +- void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output); +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_CLPRELU_H__*/ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h +deleted file mode 100644 +index e83fb01..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h ++++ /dev/null +@@ -1,103 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
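The removed CLPReLU above takes a per-element alpha tensor alongside the input. For reference, the element-wise rule it applies is the standard parametric ReLU; the helper below is a sketch of that rule only, not the CL kernel, and its name is illustrative.

// prelu_ref(3.0f, 0.25f) == 3.0f, prelu_ref(-2.0f, 0.25f) == -0.5f
static inline float prelu_ref(float x, float alpha) { return x >= 0.0f ? x : alpha * x; }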
+- */ +- +-/** +- * @file CLPixelWiseDivision.h +- * @ingroup COM_AI_RUNTIME +- * @brief This file contains arm_compute::CLPixelWiseDivision class +- */ +-#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ +-#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to run @ref CLPixelWiseDivisionKernel. +- */ +-class CLPixelWiseDivision : public ICLSimpleFunction +-{ +-public: +- /** +- * @brief Initialise the kernel's inputs, output and convertion policy. +- * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32 +- * The input tensor is [in, out] because its TensorInfo might be +- * modified inside the kernel in case of broadcasting of dimension 0. +- * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. +- * The input tensor is [in, out] because its TensorInfo might be +- * modified inside the kernel in case of broadcasting of dimension 0. +- * @param[out] output The output tensor, Data types supported: same as @p input1. +- * Note: U8 requires both inputs to be U8. +- * @param[in] scale Scale to apply after multiplication. +- * Scale must be positive and its value must be either 1/255 or +- * 1/2^n where n is between 0 and 15. +- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate +- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest +- * even. +- * @return N/A +- */ +- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, +- ConvertPolicy overflow_policy = ConvertPolicy::WRAP, +- RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); +- +- /** +- * @brief Static function to check if given info will lead to a valid configuration of @ref +- * CLPixelWiseDivision +- * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32 +- * @param[in] input2 An input tensor info. Data types supported: same as @p input1. +- * @param[in] output The output tensor info, Data types supported: same as @p input1. +- * Note: U8 requires both inputs to be U8. +- * @param[in] scale Scale to apply after multiplication. +- * Scale must be positive and its value must be either 1/255 or 1/2^n +- * where n is between 0 and 15. +- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate +- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. +- * @return a status +- */ +- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, +- const ITensorInfo *output, float scale = 1.f, +- ConvertPolicy overflow_policy = ConvertPolicy::WRAP, +- RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); +-}; +-} +-#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h +deleted file mode 100644 +index b49cbd8..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h ++++ /dev/null +@@ -1,120 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
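The scale argument of the removed CLPixelWiseDivision above carries an unusual restriction: it must be positive and equal to either 1/255 or 1/2^n with n between 0 and 15. A small standalone checker for that constraint is sketched below; the function name is illustrative, and the loop relies on powers of two comparing exactly in IEEE floats.

#include <cstdio>

// Accepts only the scales allowed by the comment above.
static bool is_valid_division_scale(float scale)
{
  if (scale == 1.0f / 255.0f)
    return true;
  float candidate = 1.0f; // 1/2^0
  for (int n = 0; n <= 15; ++n, candidate /= 2.0f)
    if (scale == candidate)
      return true;
  return false;
}

int main()
{
  // 1/255 and 1/4 are accepted, 0.3 is not: prints "1 1 0".
  std::printf("%d %d %d\n", is_valid_division_scale(1.0f / 255.0f),
              is_valid_division_scale(0.25f), is_valid_division_scale(0.3f));
  return 0;
}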
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__ +-#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__ +- +-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" +-#include "arm_compute/core/CL/kernels/CLCopyKernel.h" +-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +-#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +-#include "arm_compute/runtime/CL/functions/CLGEMM.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Basic function to run @ref CLRNNLayerEx */ +-class CLRNNLayerEx : public IFunction +-{ +-public: +- /** Default constructor */ +- CLRNNLayerEx(std::shared_ptr memory_manager = nullptr); +- /** Initialize the function +- * +- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data +- * types supported: F16/F32 +- * @param[in] weights Weights tensor of shape [input_size, num_units] that +- * multiplies the input. Data types supported: Same as @p input +- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies +- * the current 'state'. Data types supported: Same as @p input +- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same +- * as @p input +- * @param[out] output Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] info Activation layer parameter. +- */ +- void configure(const ICLTensor *input, const ICLTensor *weights, +- const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, +- ICLTensor *output, ActivationLayerInfo &info); +- /** Initialize the function +- * +- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. 
Data +- * types supported: F16/F32 +- * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies +- * the input. Data types supported: Same as @p input +- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the +- * current 'state'. Data types supported: Same as @p input +- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p +- * input +- * @param[in] output Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] info Activation layer parameter. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, +- const ITensorInfo *recurrent_weights, const ITensorInfo *bias, +- const ITensorInfo *hidden_state, const ITensorInfo *output, +- const ActivationLayerInfo &info); +- +- // Inherited methods overridden: +- void run() override; +- void prepare() override; +- +-private: +- MemoryGroup _memory_group; +- CLGEMM _gemm_state_f; +- CLSaturatedArithmeticOperationKernel _add_kernel; +- CLActivationLayerKernel _activation_kernel; +- CLFullyConnectedLayer _fully_connected_kernel; +- CLCopyKernel _copy_kernel; +- CLTensor _fully_connected_out; +- CLTensor _gemm_output; +- CLTensor _add_output; +- bool _is_prepared; +-}; +-} +-#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h +deleted file mode 100644 +index 2090b46..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h ++++ /dev/null +@@ -1,68 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
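The member list of the removed CLRNNLayerEx above (a fully connected pass on the input, a GEMM on the previous state, an add, an activation and a copy back to hidden_state) amounts to a plain vanilla-RNN step. The reference sketch below spells that step out; the row-major layout and the fixed tanh activation are assumptions made only for illustration.

#include <cmath>
#include <cstddef>
#include <vector>

// hidden_state and output both receive
// activation(weights * input + recurrent_weights * hidden_state + bias).
static void rnn_step_ref(const std::vector<float> &input,             // [input_size]
                         const std::vector<float> &weights,           // [input_size x num_units]
                         const std::vector<float> &recurrent_weights, // [num_units x num_units]
                         const std::vector<float> &bias,              // [num_units]
                         std::vector<float> &hidden_state,            // [num_units], updated in place
                         std::vector<float> &output)                  // [num_units]
{
  const std::size_t num_units = bias.size();
  const std::size_t input_size = input.size();
  std::vector<float> next(num_units);
  for (std::size_t u = 0; u < num_units; ++u)
  {
    float acc = bias[u];
    for (std::size_t i = 0; i < input_size; ++i)
      acc += weights[i * num_units + u] * input[i];
    for (std::size_t j = 0; j < num_units; ++j)
      acc += recurrent_weights[j * num_units + u] * hidden_state[j];
    next[u] = std::tanh(acc); // stands in for the ActivationLayerInfo parameter
  }
  hidden_state = next;
  output = next;
}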
IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__ +-#define __ARM_COMPUTE_CLSPACETODEPTH_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Basic function to run @ref CLSpaceToDepthKernel +- * +- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. +- * @note The function converts the input tensor to the tensor of the output tensor's type. +- */ +-class CLSpaceToDepth : public ICLSimpleFunction +-{ +-public: +- /** Initialise the kernel's input and output. +- * +- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. +- * @param[block_size] block size integer only +- */ +- void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size); +-}; +- +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h +deleted file mode 100644 +index 03edd15..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h ++++ /dev/null +@@ -1,81 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-/** +- * @file CLStridedSlice.h +- * @ingroup COM_AI_RUNTIME +- * @brief This file contains arm_compute::CLStridedSlice and arm_compute::CLStridedSliceCPU class +- */ +- +-#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ +-#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ +- +-#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** +- * @brief Class to run @ref CLStridedSliceKernel +- */ +-class CLStridedSliceEx : public ICLSimpleFunction +-{ +-public: +- /** +- * @brief Initialise the kernel's inputs and outputs +- * @param[in] input Tensor input. Data type supported: +- * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 +- * @param[out] output Output tensor. Data type supported: Same as @p input +- * @param[in] beginData 'begin' vector of strided slice operation +- * @param[in] endData 'end' vector of strided slice operation +- * @param[in] stridesData 'strides' vector of strided slice operation +- * @param[in] beginMask If the ith bit is set, begin[i] is ignored +- * @param[in] endMask If the ith bit is set, end[i] is ignored +- * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the +- * dimensionality by 1, taking on the value at index begin[i] +- * @return N/A +- */ +- void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, +- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, +- int32_t shrinkAxisMask); +-}; +-} +-#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h +index 54a697e..5fb102e 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h +@@ -15,7 +15,7 @@ + */ + + /* +- * Copyright (c) 2017-2018 ARM Limited. ++ * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,16 +37,11 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +- + #ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ + #define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ + +-#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" +- +-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h" +- +-#include "arm_compute/runtime/CL/CLTensor.h" ++#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" ++#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" + #include "arm_compute/runtime/IFunction.h" + #include "arm_compute/runtime/IMemoryManager.h" + +@@ -54,119 +49,102 @@ + + namespace arm_compute + { +-class ICLTensor; +-/** Function to run the transpose convolution layer. +- * +- * @note This layer was copied in order to fix a bug computing to wrong output dimensions. +- * +- * TransposeConv Layer is the backward pass of Convolution Layer. First we transform the input +- * depending on the stride and pad info and then perform a 1x1 +- * convolution pass. Input stride defines how many zeroes we should put between each element of the +- * input, pad is the amount of padding and finally a is a user +- * specified value where a < stride - 1, that increases the padding top and right of the input +- * image. 
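The begin/end/shrink-axis masks documented above for the removed CLStridedSliceEx are ordinary bitfields indexed by dimension. The sketch below only illustrates how such a mask is read; the names are made up, and "take the widest range" paraphrases the comment's "if the ith bit is set, begin[i]/end[i] is ignored".

#include <cstdint>
#include <cstdio>

// Bit i of begin_mask means "ignore beginData[i] and take the widest range for
// dimension i"; end_mask and shrink_axis_mask are read the same way.
static bool mask_bit_set(int32_t mask, int dim) { return ((mask >> dim) & 1) != 0; }

int main()
{
  const int32_t begin_mask = 0b101; // ignore the 'begin' values of dimensions 0 and 2
  for (int dim = 0; dim < 3; ++dim)
    std::printf("dim %d: begin %s\n", dim, mask_bit_set(begin_mask, dim) ? "ignored" : "used");
  return 0;
}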
+- * +- * The relation between input to output is as follows: +- * \f[ +- * width\_output = (width\_input - 1) \cdot stride\_x - \cdot padding\_x + kernel\_x +- * \f] +- * \f[ +- * height\_output = (height\_input - 1) \cdot stride\_y - \cdot padding\_y + kernel\_y +- * \f] +- * +- * where: +- * width_input is the size of the first input dimension. +- * height_input is the size of the second input dimension. +- * width_output is the size of the first output dimension. +- * height_output is the size of the second output dimension. +- * kernel_x and kernel_y are the convolution sizes in x and y. +- * stride_x and stride_y is the input stride of the first and second dimension. +- * +- * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. +- * Therefore, it will be necessary to use the weights in the +- * reverse order to perform an actual convolution. This is achieved by using the @ref +- * CPPFlipWeightsKernel. +- * +- * This function calls the following OpenCL kernels/functions: +- * +- * -# @ref CLTransposeConvLayerUpsample +- * -# @ref CLConvolutionLayer ++/** Basic function to compute the deconvolution layer. This function calls the following OpenCL ++ * kernels/functions: + * ++ * -# @ref CLGEMMDeconvolutionLayer ++ * -# @ref CLDirectTransposeConvLayer + */ + class CLTransposeConvLayer : public IFunction + { + public: +- /** Constructor */ ++ /** Default constructor */ + CLTransposeConvLayer(std::shared_ptr memory_manager = nullptr); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayer(const CLTransposeConvLayer &) = delete; +- /** Default move constructor */ +- CLTransposeConvLayer(CLTransposeConvLayer &&) = default; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete; +- /** Default move assignment operator */ +- CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default; ++ + /** Set the input, weights, biases and output tensors. + * +- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, +- * and an optional 4th dimension for batch of inputs. +- * Data types supported: QASYMM8/F16/F32. +- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. +- * Data type supported: Same as @p input. +- * @param[in] bias (Optional) The biases have one dimension. Data type supported: +- * Same as @p input. +- * @param[out] output Output tensor. The output has the same number of dimensions +- * as the @p input. +- * @param[in] info Contains padding and policies to be used in the +- * transpose convolution, this is decribed in @ref PadStrideInfo. +- * @param[in] invalid_right The number of zeros added to right edge of the output. +- * @param[in] invalid_bottom The number of zeros added to top edge of the output. +- * @param[in] weights_info (Optional) Weights information needed for @ref +- * CLConvolutionLayer, specifies if the weights tensor has been +- * reshaped with @ref CLWeightsReshapeKernel. ++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an ++ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type ++ * supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same ++ * as @p input. 
++ * @param[out] output Output tensor. The output has the same number of dimensions as the ++ * @p input. ++ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this ++ * is described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output. ++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. ++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, ++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. ++ * + */ + void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, +- const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, ++ const PadStrideInfo &deconv_info, unsigned int invalid_right, ++ unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); ++ /** Set the input, weights, biases and output tensors. ++ * ++ * @param[in] compile_context The compile context to be used. ++ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and ++ * an optional 4th dimension for batch of inputs. Data types supported: ++ * QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data ++ * type supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension. Data type supported: ++ * Same as @p input. ++ * @param[out] output Output tensor. The output has the same number of dimensions as ++ * the @p input. ++ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, ++ * this is described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output. ++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. ++ * @param[in] weights_info (Optional) Weights information needed for @ref ++ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref ++ * CLWeightsReshapeKernel. ++ * ++ */ ++ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, ++ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, ++ unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref +- * CLTransposeConvLayer ++ * CLTransposeConvLayer ++ * ++ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an ++ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. ++ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data ++ * type supported: Same as @p input. ++ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as ++ * @p input. ++ * @param[in] output Output tensor info. The output has the same number of dimensions as the ++ * @p input. ++ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is ++ * described in @ref PadStrideInfo. ++ * @param[in] invalid_right The number of zeros added to right edge of the output. ++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
++ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, ++ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * +- * @param[in] input Input tensor info. 3 lower dimensions represent a single input, +- * and an optional 4th dimension for batch of inputs. +- * Data types supported: QASYMM8/F16/F32. +- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. +- * Data type supported: Same as @p input. +- * @param[in] bias (Optional) The biases have one dimension. Data type supported: +- * Same as @p input. +- * @param[in] output Output tensor info. The output has the same number of dimensions +- * as the @p input. +- * @param[in] info Contains padding and policies to be used in the +- * transpose convolution, this is decribed in @ref PadStrideInfo. +- * @param[in] innvalid_right The number of zeros added to right edge of the output. +- * @param[in] invalid_bottom The number of zeros added to top edge of the output. +- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, +- * specifies if the weights tensor has been reshaped with @ref +- * CLWeightsReshapeKernel. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, +- const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, +- unsigned int innvalid_right, unsigned int invalid_bottom, ++ const ITensorInfo *bias, ITensorInfo *output, ++ const PadStrideInfo &deconv_info, unsigned int invalid_right, ++ unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + ++ static DeconvolutionMethod ++ get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, ++ const ITensorInfo *bias, ITensorInfo *output, ++ const PadStrideInfo &deconv_info, unsigned int invalid_right, ++ unsigned int invalid_bottom, const WeightsInfo &weights_info); + // Inherited methods overridden: + void run() override; + void prepare() override; + + private: +- MemoryGroup _memory_group; +- CLTransposeConvLayerUpsample _scale_f; +- CLConvolutionLayer _conv_f; +- CPPFlipWeightsKernel _flip_weights; +- CLTensor _scaled_output; +- ICLTensor *_original_weights; +- CLTensor _weights_flipped; +- bool _is_prepared; ++ std::shared_ptr<IMemoryManager> _memory_manager; ++ std::unique_ptr<IFunction> _function; + }; +-} ++} // namespace arm_compute + #endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h +deleted file mode 100644 +index 7570fe7..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h ++++ /dev/null +@@ -1,102 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2018 ARM Limited.
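With the rewrite above, CLTransposeConvLayer no longer runs the upsample-plus-convolution pipeline itself: get_deconvolution_method() picks a DeconvolutionMethod at configure() time, and the layer then owns whichever implementation was selected through a std::unique_ptr to IFunction. The sketch below shows only that ownership/dispatch shape; the interface, the two implementations and the enum are stand-ins, not the ACL classes.

#include <memory>

struct IRunnable
{
  virtual ~IRunnable() = default;
  virtual void run() = 0;
};

struct DirectPath : IRunnable
{
  void run() override { /* direct path: upsample + convolution */ }
};

struct GemmPath : IRunnable
{
  void run() override { /* GEMM-based deconvolution path */ }
};

enum class Method { Direct, Gemm };

class TransposeConvDispatcher
{
public:
  void configure(Method m)
  {
    // Decide once, then delegate every run() to the chosen implementation.
    _function = (m == Method::Direct) ? std::unique_ptr<IRunnable>(new DirectPath())
                                      : std::unique_ptr<IRunnable>(new GemmPath());
  }
  void run() { _function->run(); }

private:
  std::unique_ptr<IRunnable> _function;
};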
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ +-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ +- +-#include "arm_compute/runtime/IFunction.h" +- +-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/runtime/IFunction.h" +-#include "arm_compute/runtime/IMemoryManager.h" +- +-namespace arm_compute +-{ +-class ICLTensor; +- +-/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */ +-class CLTransposeConvLayerUpsample : public IFunction +-{ +-public: +- /** Default constructor */ +- CLTransposeConvLayerUpsample(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete; +- /** Allow instances of this class to be moved */ +- CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default; +- /** Allow instances of this class to be moved */ +- CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default; +- /** Default destructor */ +- virtual ~CLTransposeConvLayerUpsample() = default; +- +- /** Initialize the function's source, destination, interpolation type and border_mode. +- * +- * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. +- * @param[out] output Destination tensor. Data type supported: same as @p input. +- * @param[in] inner_border The number of zeros added to right and top edges of the input. +- * @param[in] info Contains padding and policies to be used in the deconvolution. +- */ +- void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, +- const PadStrideInfo &info); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * CLTransposeConvLayerUpsample +- * +- * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. +- * @param[in] output Destination tensor info. Data type supported: same as @p input. +- * @param[in] inner_border The number of zeros added to right and top edges of the input. +- * @param[in] info Contains padding and policies to be used in the deconvolution. 
+- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, +- const BorderSize &inner_border, const PadStrideInfo &info); +- +- // Inherited methods overridden: +- void run() override; +- +-private: +- CLTransposeConvLayerUpsampleKernel _upsample; +- ICLTensor *_output; +-}; +-} +-#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h +deleted file mode 100644 +index 666afef..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h ++++ /dev/null +@@ -1,65 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ +-#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ +- +-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" +- +-#include "arm_compute/core/Types.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to run @ref CPPUpsample */ +-class CPPUpsampleEx : public ICPPSimpleFunction +-{ +-public: +- /** Configure the upsample CPP kernel +- * +- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 +- * @param[out] output The output tensor. 
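The removed CLTransposeConvLayerUpsample above (and the CPP upsample being removed next) performs the zero-insertion step of the deconvolution: stride - 1 zeros are placed between neighbouring input elements before the convolution pass. A 1-D reference sketch of that expansion, with an illustrative name, follows.

#include <cstddef>
#include <vector>

// upsample_zero_insert_1d({1, 2, 3}, 2) yields {1, 0, 2, 0, 3};
// a 2-D upsample applies the same expansion along both spatial axes.
static std::vector<float> upsample_zero_insert_1d(const std::vector<float> &in, int stride)
{
  if (in.empty())
    return {};
  std::vector<float> out((in.size() - 1) * stride + 1, 0.0f);
  for (std::size_t i = 0; i < in.size(); ++i)
    out[i * stride] = in[i];
  return out;
}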
Data types supported: Same as @p input +- * @param[in] info Padding information +- */ +- void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); +-}; +-} +-#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h +index 49504fd..3fad230 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h +@@ -18,20 +18,13 @@ + + #include + #include +-#include +-#include + #include + #include + #include + #include + #include +-#include +-#include + #include +-#include + #include +-#include +-#include + #include + + #endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h +deleted file mode 100644 +index f0f0d81..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h ++++ /dev/null +@@ -1,79 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NECAST_H__ +-#define __ARM_COMPUTE_NECAST_H__ +- +-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +- +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/TypesEx.h" +- +-namespace arm_compute +-{ +-// Forward declarations +-class ITensor; +- +-/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */ +-class NECast : public INESimpleFunctionNoBorder +-{ +-public: +- /** Configure the kernel. +- * +- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32. 
+- * @param[out] output Destination tensor with the same dimensions of input. Data type supported: +- * U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] input_subtype Sub data type of input. +- */ +- void configure(const ITensor *input, ITensor *output, +- SubDataType input_subtype = SubDataType::NONE); +- /** Static function to check if given info will lead to a valid configuration of @ref NECast +- * +- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32. +- * @param[in] input_subtype Sub data type of input. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, +- SubDataType input_subtype = SubDataType::NONE); +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NECAST_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h +deleted file mode 100644 +index 005d85a..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h ++++ /dev/null +@@ -1,78 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ +-#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ +- +-#include "arm_compute/runtime/IFunction.h" +- +-#include "arm_compute/core/Types.h" +-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. 
*/ +-class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder +-{ +-public: +- /** Set the input and output tensors. +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[out] output Tensor output. Data types supported: same as @p input +- * @param[in] block_shape Block shape value. +- */ +- void configure(const ITensor *input, ITensor *output, int32_t block_shape); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEDepthToSpaceLayerEx. +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * @param[in] block_shape Block shape x value. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h +deleted file mode 100644 +index 27a38e9..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h ++++ /dev/null +@@ -1,70 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ +-#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ +- +-#include "arm_compute/runtime/NEON/INESimpleFunction.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to perform negative on an input tensor. */ +-class NENegLayer : public INESimpleFunction +-{ +-public: +- /** Initialize the function +- * +- * @param[in] input Input tensor. Data types supported: F16/F32/S32. +- * @param[out] output Output tensor. Data types supported: same as @p input. +- */ +- void configure(const ITensor *input, ITensor *output); +- /** Static function to check if given info will lead to a valid configuration of @ref NERsqrtLayer +- * +- * @param[in] input First tensor input info. Data types supported: F16/F32/S32. +- * @param[in] output Output tensor info. Data types supported: Same as @p input. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output); +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +index 39c57eb..56548a4 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +@@ -46,7 +46,7 @@ + #include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" + #include "arm_compute/core/NEON/kernels/NETransposeKernel.h" + #include "arm_compute/runtime/MemoryGroup.h" +-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" ++#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" + #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + #include "arm_compute/runtime/Tensor.h" + +@@ -164,7 +164,7 @@ private: + MemoryGroup _memory_group; + NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; + NEQuantizationSymmetricKernel _quant_input_kernel; +- NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; ++ NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + NEMultiplyScaleFactorKernel _multiply_scale_kernel; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _reshape_weights_output; +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h +deleted file mode 100644 +index d844513..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h ++++ /dev/null +@@ -1,170 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +-#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +- +-#include "arm_compute/core/NEON/INEKernel.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +-#include "arm_compute/runtime/IFunction.h" +-#include "arm_compute/runtime/IMemoryManager.h" +-#include "arm_compute/runtime/MemoryGroup.h" +-// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" +-#include "arm_compute/runtime/Tensor.h" +- +-#include +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following +- * NEON kernels if the DOT product instruction is not available: +- * +- * -# @ref NEGEMMInterleave4x4Kernel +- * -# @ref NEGEMMTranspose1xWKernel +- * -# @ref NEGEMMLowpMatrixMultiplyKernel +- * -# @ref NEGEMMLowpOffsetContributionKernel +- * -# @ref NEActivationLayer +- * +- * otherwise if the DOT product instruction is available: +- * +- * -# @ref NEGEMMLowpOffsetContributionKernel +- * +-*/ +-class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction +-{ +-public: +- /** Constructor */ +- NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr memory_manager = nullptr); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; +- /** Default move constructor */ +- NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; +- /** Default move assignment operator */ +- NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; +- /** Initialise the kernel's inputs, output +- * +- * @note GEMM_LOWP: low precision GEMM kernel +- * This kernel performs the following computations: +- * +- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. 
+- * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. +- * -# Compute the matrix product of the resulting a * b in int32. +- * +- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is +- * QASYMM8/QASYMM8_SIGNED otherwise +- * +- * @param[in] a First input tensor (Matrix A). Data type supported: +- * QASYMM8/QASYMM8_SIGNED. +- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a +- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: +- * S32 +- * @param[out] output Output tensor. Data type supported: Data type supported: +- * S32/QASYMM8/QASYMM8_SIGNED +- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped +- * and +- * if the reshape of matrix B should be executed only for the first run +- */ +- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, +- const GEMMInfo &gemm_info = GEMMInfo()); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEGEMMLowpMatrixMultiplyCoreEx +- * +- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is +- * QASYMM8/QASYMM8_SIGNED otherwise +- * +- * @param[in] a First input tensor info (Matrix A). Data type supported: +- * QASYMM8/QASYMM8_SIGNED. +- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a +- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type +- * supported: S32 +- * @param[in] output Output tensor info. Data type supported: Data type supported: +- * S32/QASYMM8/QASYMM8_SIGNED +- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped +- * and +- * if the reshape of matrix B should be executed only for the first run +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, +- const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); +- +- // Inherited methods overridden +- void run() override; +- void prepare() override; +- +-private: +- MemoryGroup _memory_group; +- NEGEMMAssemblyDispatch _asm_glue; +- std::unique_ptr _mm_kernel; +- std::unique_ptr _mtx_a_reshape_kernel; +- std::unique_ptr _mtx_b_reshape_kernel; +- NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; +- NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; +- NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; +- NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel; +- +- Tensor _vector_sum_col; +- Tensor _vector_sum_row; +- Tensor _tmp_a; +- Tensor _tmp_b; +- Tensor _mm_result_s32; +- Tensor _signed_a; +- Tensor _signed_output; +- const ITensor *_original_b; +- int32_t _a_offset; +- int32_t _b_offset; +- +- bool _run_vector_matrix_multiplication; +- bool _assembly_path; +- bool _fused_assembly_path; +- bool _reshape_b_only_on_first_run; +- bool _is_prepared; +- bool _fuse_output_stage; +- bool _flip_signedness; +-}; +-} // namespace arm_compute +-#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h +deleted file mode 100644 +index ca84133..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h ++++ /dev/null +@@ -1,63 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEPRELU_H__ +-#define __ARM_COMPUTE_NEPRELU_H__ +- +-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to run @ref NEPReLUKernel */ +-class NEPReLU : public INESimpleFunctionNoBorder +-{ +-public: +- /** Initialise the kernel's inputs and output +- * +- * @param[in] input. Data types supported: QASYMM8/F32. +- * @param[in] alpha. Data types supported: Same as @p input. +- * @param[out] output Output tensor. Data types supported: Same as @p input. +- */ +- void configure(const ITensor *input, const ITensor *alpha, ITensor *output); +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEPRELU_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h +deleted file mode 100644 +index 8a7b179..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h ++++ /dev/null +@@ -1,130 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. 
+- */ +- +-/* +- * Copyright (c) 2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__ +-#define __ARM_COMPUTE_NERNNLAYER_EX_H__ +- +-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" +-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" +-#include "arm_compute/core/NEON/kernels/NECopyKernel.h" +- +-#include "arm_compute/core/Types.h" +-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +-#include "arm_compute/runtime/NEON/functions/NEGEMM.h" +- +-namespace arm_compute +-{ +-// Forward declarations +-class ITensor; +- +-/** Basic function to run @ref NERNNLayerEx */ +-class NERNNLayerEx : public IFunction +-{ +-public: +- /** Default constructor */ +- NERNNLayerEx(std::shared_ptr memory_manager = nullptr); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NERNNLayerEx(const NERNNLayerEx &) = delete; +- /** Default move constructor */ +- NERNNLayerEx(NERNNLayerEx &&) = default; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NERNNLayerEx &operator=(const NERNNLayerEx &) = delete; +- /** Default move assignment operator */ +- NERNNLayerEx &operator=(NERNNLayerEx &&) = default; +- /** Initialize the function +- * +- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data +- * types supported: F16/F32 +- * @param[in] weights Weights tensor of shape [input_size, num_units] that +- * multiplies the input. Data types supported: Same as @p input +- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies +- * the current 'state'. Data types supported: Same as @p input +- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same +- * as @p input +- * @param[out] output Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] info Activation layer parameter. 
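(For readers skimming the deleted NERNNLayerEx header above: per its doxygen, the layer computes one vanilla RNN step, hidden_state = activation(weights * input + recurrent_weights * hidden_state + bias), with the tensor shapes listed in the parameter block. Below is a minimal plain-C++ sketch of that step, not the NEON implementation; batch size 1, a fixed tanh activation, and the row-major weight layout are assumptions of the sketch, and the function name is illustrative.)

#include <cmath>
#include <cstddef>
#include <vector>

// One RNN step: out[u] = tanh(b[u] + sum_i W[u][i]*x[i] + sum_j R[u][j]*h[j])
// x: input_size, h: num_units, W: num_units*input_size, R: num_units*num_units, b: num_units.
std::vector<float> rnn_step(const std::vector<float> &x, const std::vector<float> &h,
                            const std::vector<float> &W, const std::vector<float> &R,
                            const std::vector<float> &b)
{
  const std::size_t input_size = x.size();
  const std::size_t num_units = h.size();
  std::vector<float> out(num_units, 0.0f);
  for (std::size_t u = 0; u < num_units; ++u)
  {
    float acc = b[u];
    for (std::size_t i = 0; i < input_size; ++i)
      acc += W[u * input_size + i] * x[i]; // fully connected part (weights * input)
    for (std::size_t j = 0; j < num_units; ++j)
      acc += R[u * num_units + j] * h[j]; // recurrent part (recurrent_weights * hidden_state)
    out[u] = std::tanh(acc); // stands in for the ActivationLayerInfo passed to configure()
  }
  return out;
}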
+- */ +- void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, +- const ITensor *bias, ITensor *hidden_state, ITensor *output, +- ActivationLayerInfo &info); +- /** Initialize the function +- * +- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data +- * types supported: F16/F32 +- * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies +- * the input. Data types supported: Same as @p input +- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the +- * current 'state'. Data types supported: Same as @p input +- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p +- * input +- * @param[in] output Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types +- * supported: Same as @p input +- * @param[in] info Activation layer parameter. +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, +- const ITensorInfo *recurrent_weights, const ITensorInfo *bias, +- const ITensorInfo *hidden_state, const ITensorInfo *output, +- const ActivationLayerInfo &info); +- +- // Inherited methods overridden: +- void run() override; +- void prepare() override; +- +-private: +- MemoryGroup _memory_group; +- NEGEMM _gemm_state_f; +- NEArithmeticAdditionKernel _add_kernel; +- NEActivationLayerKernel _activation_kernel; +- NEFullyConnectedLayer _fully_connected_kernel; +- NECopyKernel _copy_kernel; +- Tensor _fully_connected_out; +- Tensor _gemm_output; +- Tensor _add_output; +- bool _is_prepared; +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h +deleted file mode 100644 +index 03ac457..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h ++++ /dev/null +@@ -1,99 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. 
+- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ +-#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ +- +-#include "arm_compute/runtime/IFunction.h" +- +-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/runtime/MemoryGroup.h" +-#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to perform reduce operation */ +-class NEReduceMeanEx : public IFunction +-{ +-public: +- /** Constructor */ +- NEReduceMeanEx(std::shared_ptr memory_manager = nullptr); +- /** Configure kernel +- * +- * @note Supported tensor rank: up to 4 +- * +- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 +- * @param[in] reduction_axis Reduction axis vector. +- * @param[in] keep_dims If positive, retains reduced dimensions with length 1. +- * @param[out] output Destination tensor. Data type supported: Same as @p input +- */ +- void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, +- ITensor *output); +- +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NEReduceMeanEx +- * +- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 +- * @param[in] reduction_axis Reduction axis vector. +- * @param[in] keep_dims If positive, retains reduced dimensions with length 1. +- * @param[in] output Destination tensor. Data type supported: Same as @p input +- * +- * @return A status +- */ +- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, +- bool keep_dims, const ITensorInfo *output); +- +- // Inherited methods overridden: +- void run() override; +- +-private: +- MemoryGroup _memory_group; +- std::unique_ptr _reduction_kernels{nullptr}; +- std::unique_ptr _reduced_outs{nullptr}; +- NEReshapeLayer _reshape; +- unsigned int _reduction_ops; +- bool _keep_dims; +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h +deleted file mode 100644 +index 3b695fb..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h ++++ /dev/null +@@ -1,136 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ +-#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ +- +-#include "arm_compute/runtime/IFunction.h" +- +-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" +-#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h" +-#include "arm_compute/core/Types.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** Basic function to spatial divide a tensor. This function calls the following NEON +- * kernels/functions: +- * +- * -# @ref NEMemsetKernel +- * -# @ref NESpaceToBatchLayerKernel +- */ +-class NESpaceToBatchLayerEx : public IFunction +-{ +-public: +- /** Default constructor */ +- NESpaceToBatchLayerEx(); +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete; +- /** Prevent instances of this class from being copied (As this class contains pointers) */ +- NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete; +- /** Allow instances of this class to be moved */ +- NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default; +- /** Allow instances of this class to be moved */ +- NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default; +- /** Default destructor */ +- virtual ~NESpaceToBatchLayerEx() = default; +- /** Set the input and output tensors. +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 +- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 +- * @param[out] output Tensor output. Data types supported: same as @p input +- */ +- void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, +- ITensor *output); +- /** Set the input and output tensors. (Static block shape and paddings) +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] block_shape_x Block shape x value. +- * @param[in] block_shape_y Block shape y value. 
+- * @param[in] padding_left The left padding of the output tensor. +- * @param[in] padding_right The right padding of the output tensor. +- * @param[out] output Tensor output. Data types supported: same as @p input +- */ +- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, +- const Size2D &padding_left, const Size2D &padding_right, ITensor *output); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NESpaceToBatchLayerEx +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32 +- * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32 +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, +- const ITensorInfo *paddings, const ITensorInfo *output); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NESpaceToBatchLayerEx (Static block shape and paddings) +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] block_shape_x Block shape x value. +- * @param[in] block_shape_y Block shape y value. +- * @param[in] padding_left The left padding of the output tensor. +- * @param[in] padding_right The right padding of the output tensor. +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, +- const Size2D &padding_left, const Size2D &padding_right, +- const ITensorInfo *output); +- +- // Inherited methods overridden: +- void run() override; +- +-private: +- NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ +- NEMemsetKernel _memset_kernel; /**< Memset kernel to run */ +- bool _has_padding; /**< Flag to check if the output has padding */ +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h +deleted file mode 100644 +index 9f32616..0000000 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h ++++ /dev/null +@@ -1,79 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ +-#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ +- +-#include "arm_compute/core/Types.h" +-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +- +-namespace arm_compute +-{ +-class ITensor; +- +-/** This function calls the following NEON kernels/functions: +- * +- * -# @ref NESpaceToDepthLayerKernelEx +- */ +-class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder +-{ +-public: +- /** Set the input and output tensors. +- * +- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[out] output Tensor output. Data types supported: same as @p input +- * @param[in] block_shape Block shape value +- */ +- void configure(const ITensor *input, ITensor *output, int32_t block_shape); +- /** Static function to check if given info will lead to a valid configuration of @ref +- * NESpaceToDepthLayerEx (Static block shape and paddings) +- * +- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: +- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. +- * @param[in] output Tensor output info. Data types supported: same as @p input +- * @param[in] block_shape Block shape value +- * +- * @return a status +- */ +- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +-}; +-} // namespace arm_compute +-#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */ +diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +index 408d150..24ff5da 100644 +--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h ++++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +@@ -15,7 +15,7 @@ + */ + + /* +- * Copyright (c) 2017-2019 ARM Limited. ++ * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,16 +37,14 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+-
+ #ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+ #define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+
+-#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
++#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
+ #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+ #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+-#include "arm_compute/runtime/NEON/functions/NEPermute.h"
++#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+
+-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+ #include "arm_compute/core/Types.h"
+ #include "arm_compute/runtime/IFunction.h"
+ #include "arm_compute/runtime/IMemoryManager.h"
+@@ -59,8 +57,8 @@ namespace arm_compute
+ {
+ /** Function to run the deconvolution layer.
+ *
+- * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the
+- * input depending on the stride and pad info and then perfrom a 1x1
++ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
++ * depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input, pad is the amount of padding and finaly a is a user
+ * specified value where a < stride - 1 that increases the padding top and right of the input image.
+@@ -81,21 +79,22 @@ namespace arm_compute
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y is the input stride of the first and second dimension.
+ *
+- * The weights used by Transpose convolution are supposed to be the same as the ones used for
+- * Convolution. Therefore, it will be necessary to use the weights in the
+- * reverse order to perform an actual convolution. This is achieved by using the @ref
+- * CPPFlipWeightsKernel.
++ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
++ * Therefore, it will be necessary to use the weights in the
++ * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+- * -# @ref CPPUpsample
++ * -# @ref CPPUpsampleEx
+ * -# @ref NEConvolutionLayer
++ * -# @ref NEPermute
++ * -# @ref NEReverse
+ *
+ */
+ class NETransposeConvLayer : public IFunction
+ {
+ public:
+- /** Default constructor */
++ /** Constructor */
+ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+@@ -112,37 +111,38 @@ public:
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
++ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+- * supported: Same as @p input.
++ * supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type
+- * supported: Data types supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
++ * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16
++ * for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p
+- * input.
++ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+- * decribed in @ref PadStrideInfo.
+- * @param[in] invalid_right The number of zeros added to right edge of the output.
+- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
++ * described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+- * NETransposeConvLayer
++ * NETransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
++ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
+- * supported: Same as @p input.
++ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types
+- * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
++ * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
+- * input.
++ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+- * decribed in @ref PadStrideInfo.
+- * @param[in] innvalid_right The number of zeros added to right edge of the output.
+- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
++ * described in @ref PadStrideInfo.
++ * @param[in] invalid_right The number of zeros added to right edge of the output.
++ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
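(Reading aid for the NETransposeConvLayer hunk above: the transposed convolution is realised as zero-insertion upsampling of the input followed by an ordinary convolution whose weights have been reversed via NEReverse. A minimal 1-D sketch of the upsample step and the resulting size relation follows; plain C++, illustrative helper names, not the library implementation, and it ignores user padding and the invalid_right/invalid_bottom trimming handled by configure()/validate().)

#include <cstddef>
#include <vector>

// Zero-insertion upsample: place (stride - 1) zeros between neighbouring input elements.
// The convolution pass then runs over a suitably padded version of this tensor.
std::vector<float> zero_insert(const std::vector<float> &in, unsigned int stride)
{
  if (in.empty() || stride == 0)
    return {};
  std::vector<float> up((in.size() - 1) * stride + 1, 0.0f);
  for (std::size_t i = 0; i < in.size(); ++i)
    up[i * stride] = in[i];
  return up;
}

// Output width of the transposed convolution for kernel size k and stride s, with no padding:
// width_out = (width_in - 1) * s + k
unsigned int deconv_out_width(unsigned int width_in, unsigned int k, unsigned int s)
{
  return (width_in - 1) * s + k;
}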
+ * + * @return a status + */ +@@ -158,17 +158,11 @@ public: + private: + MemoryGroup _memory_group; + NEConvolutionLayer _conv_f; +- CPPUpsampleEx _upsample_f; +- CPPFlipWeightsKernel _flip_weights; +- NEPermute _permute_input; +- NEPermute _permute_weights; +- NEPermute _permute_output; ++ CPPUpsample _upsample_f; ++ NEReverse _flip_weights; + Tensor _scaled_output; + Tensor _weights_flipped; +- Tensor _permuted_input; +- Tensor _permuted_weights; +- Tensor _permuted_output; +- bool _is_nchw; ++ Tensor _flip_axis; + const ITensor *_original_weights; + ITensor *_input; + PadStrideInfo _info; +diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +index 7b6b974..ba42a24 100644 +--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp ++++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +@@ -55,16 +55,7 @@ using namespace arm_compute; + + const std::map CLKernelLibraryEx::_kernel_program_map = { + // ARMComputeEx kernels +- {"arg_op", "arg_operation.cl"}, +- {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, +- {"cast", "cast.cl"}, +- {"cast_qasymm_in", "cast.cl"}, +- {"cast_qasymm_out", "cast.cl"}, +- {"comparison_op", "comparison_op.cl"}, +- {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, +- {"depth_to_space_nchw", "depth_to_space.cl"}, +- {"depth_to_space_nhwc", "depth_to_space.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, +@@ -74,10 +65,6 @@ const std::map CLKernelLibraryEx::_kernel_program_map + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"multiply_scale_factor", "multiply_scale_factor.cl"}, + {"neg_tensor", "neg_tensor.cl"}, +- {"permute_generic", "permute_ex.cl"}, +- {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, +- {"prelu", "prelu.cl"}, +- {"prelu_qasymm8", "prelu_quantized.cl"}, + {"quantization_symm8", "quantization_symm8.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, +@@ -91,29 +78,15 @@ const std::map CLKernelLibraryEx::_kernel_program_map + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"scale_factor_symm8", "scale_factor.cl"}, +- {"space_to_depth_nchw", "space_to_depth.cl"}, +- {"space_to_depth_nhwc", "space_to_depth.cl"}, + }; + + const std::map CLKernelLibraryEx::_program_source_map = { + #ifdef EMBEDDED_KERNELS + { +- "arg_operation.cl", +-#include "./cl_kernels/arg_operation.clembed" +- }, +- { +- "cast.cl", +-#include "./cl_kernels/cast.clembed" +- }, +- { + "embedding_lookup.cl", + #include "./cl_kernels/embedding_lookup.clembed" + }, + { +- "depth_to_space.cl", +-#include "./cl_kernels/depth_to_space.clembed" +- }, +- { + "gather_ex.cl", + #include "./cl_kernels/gather_ex.clembed" + }, +@@ -150,14 +123,6 @@ const std::map CLKernelLibraryEx::_program_source_map + #include "./cl_kernels/neg_tensor.clembed" + }, + { +- "prelu.cl", +-#include "./cl_kernels/prelu.clembed" +- }, +- { +- "prelu_quantized.cl", +-#include "./cl_kernels/prelu_quantized.clembed" +- }, +- { + "quantization_symm8.cl", + #include "./cl_kernels/quantization_symm8.clembed" + }, +@@ -170,10 +135,6 @@ const std::map CLKernelLibraryEx::_program_source_map + #include "./cl_kernels/scale_factor.clembed" + }, + { +- "space_to_depth.cl", +-#include "./cl_kernels/space_to_depth.clembed" +- }, +- { + "topkv2.cl", + #include "./cl_kernels/topkv2.clembed" + }, +diff 
--git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl +deleted file mode 100644 +index 03717cf..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl ++++ /dev/null +@@ -1,137 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +- +-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +-/** Perform arg_max/arg_min +- * +- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. +- * e.g. -DDATA_TYPE=short +- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. +- * e.g. -DDEPTH_OUT=16 +- * @attention Operation type(code) specifying which operation to perform should be passed as +- * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 +- * +- * @param[in] input_ptr Pointer to the source image. 
Supported data +- * types: +- * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension +- * (in bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension +- * (in bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension +- * (in bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element +- * in the source image +- * @param[in] input_stride_w Stride of the source tensor in W dimension +- * (in bytes) +- * @param[in] input_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[out] output_ptr Pointer to the destination image. +- * Supported data types: U32 +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension +- * (in bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_stride_w Stride of the source tensor in W dimension +- * (in bytes) +- * @param[in] output_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- * @param[in] axis Axis through which reduction occurs +- * @param[in] dim Dimension across the axis to be reduced. 
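(The arg_op kernel documented above reduces one axis to the index of its maximum when OP_CODE=1 or its minimum when OP_CODE=2, writing the index as an unsigned integer. A scalar C++ reference for a single slice along the reduction axis, mirroring the update rule in the kernel body that follows; the function name is illustrative and not part of the library.)

#include <algorithm>
#include <cstdint>
#include <vector>

// Walk the slice, keep the running max (or min), and record the index at which that
// running value last changed; this matches the tval/idx bookkeeping in the OpenCL kernel.
uint32_t arg_op_slice(const std::vector<float> &slice, int op_code)
{
  if (slice.empty())
    return 0;
  float best = slice[0];
  uint32_t idx = 0;
  for (uint32_t i = 1; i < slice.size(); ++i)
  {
    const float v = (op_code == 1) ? std::max(best, slice[i]) : std::min(best, slice[i]);
    if (v != best)
    {
      idx = i;
      best = v;
    }
  }
  return idx;
}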
+- */ +- +-__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis, +- const int dim) +-{ +- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); +- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); +- +- int indices[4] = { +- get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, +- get_global_id(2) / DEPTH_OUT, +- }; +- +- DATA_TYPE value = +- *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); +- DATA_TYPE tval = value; +- int idx = 0; +- for (int i = 1; i < dim; ++i) +- { +- indices[axis] = i; +- +-#if OP_CODE == 1 // ArgMax +- value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], +- indices[2], indices[3]))); +-#elif OP_CODE == 2 // ArgMin +- value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], +- indices[2], indices[3]))); +-#else +- return; +- +-#endif +- +- if (tval != value) +- { +- idx = indices[axis]; +- tval = value; +- } +- } +- +- *((__global uint *)out.ptr) = idx; +-} +-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +deleted file mode 100644 +index f74c1c1..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl ++++ /dev/null +@@ -1,191 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016, 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
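For reference, the arg_op kernel deleted above scans dim entries along the reduction axis while the other three coordinates stay fixed, and keeps the index of the running maximum (OP_CODE == 1) or minimum (OP_CODE == 2). A minimal scalar C sketch of that per-work-item loop (the function name and the driver are illustrative, not part of the patch):

#include <stdio.h>

/* Return the index of the max (op == 1) or min (op == 2) element of a 1-D
 * slice, mirroring the per-work-item loop of the deleted arg_op kernel. */
static int arg_op_1d(const float *slice, int dim, int op)
{
  float best = slice[0];
  int idx = 0;
  for (int i = 1; i < dim; ++i)
  {
    float v = (op == 1) ? (slice[i] > best ? slice[i] : best)
                        : (slice[i] < best ? slice[i] : best);
    if (v != best) /* value changed, so position i is the new arg-max/min */
    {
      idx = i;
      best = v;
    }
  }
  return idx;
}

int main(void)
{
  const float x[5] = {1.f, 7.f, 3.f, 7.f, 2.f};
  printf("argmax=%d argmin=%d\n", arg_op_1d(x, 5, 1), arg_op_1d(x, 5, 2));
  return 0; /* prints argmax=1 argmin=0 */
}

As in the kernel, ties keep the earliest index, because the running value only changes when a strictly better element is found.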
+- */ +- +-#include "helpers_asymm.h" +- +-#ifdef SATURATE +-#define ADD(x, y) add_sat((x), (y)) +-#define SUB(x, y) sub_sat((x), (y)) +-#else /* SATURATE */ +-#define ADD(x, y) (x) + (y) +-#define SUB(x, y) (x) - (y) +-#endif /* SATURATE */ +- +-/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to +- * QASYMM8 +- * +- * The following computations will be performed: +- * +- * -# Add offset terms to inputs +- -# Get scaled value of two inputs +- * -# Add inputs +- * -# Add offset terms to final result +- * -# Multiply each entry of result by result_mult_int +- * -# Shift the int32 accumulator by result_shift +- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. +- * +- * @attention The inputs and output data types need to be passed at compile time using +- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: +- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar +- * @attention The number of bits to shift left of input tensors must be passed at compile time using +- * -DLEFT_SHIFT +- * @attention The offset, scalar scale factor and number of bits to shift right of input tensors +- * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, +- -DIN2_OFFSET, +- * -RIN2_MULT_INT and -DIN2_SHIFT +- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor +- * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and +- -DRESULT_SHIFT +- * +- * @attention The input and output data_types need to be passed at compile time using +- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: +- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar +- * @attention The inputs and output scale information of qasymm8 need to be passed at compile time +- * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: +- * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f +- * @attention The inputs and output scale offset need to be passed at compile time using +- * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: +- * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise +- * wrapping policy will be used. +- * +- * @param[in] in1_ptr Pointer to the source tensor. +- * Supported data types: QASYMM8 +- * @param[in] in1_stride_x Stride of the source tensor in X dimension +- * (in bytes) +- * @param[in] in1_step_x in1_stride_x * number of elements along X processed +- * per workitem(in bytes) +- * @param[in] in1_stride_y Stride of the source tensor in Y dimension +- * (in bytes) +- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed +- * per workitem(in bytes) +- * @param[in] in1_stride_z Stride of the source tensor in Z dimension +- * (in bytes) +- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed +- * per workitem(in bytes) +- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source +- * tensor +- * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: +- * QASYMM8 +- * @param[in] in2_stride_x Stride of the source tensor in X dimension +- * (in bytes) +- * @param[in] in2_step_x in2_stride_x * number of elements along X processed +- * per workitem(in bytes) +- * @param[in] in2_stride_y Stride of the source tensor in Y dimension +- * (in bytes) +- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed +- * per workitem(in bytes) +- * @param[in] in2_stride_z Stride of the source tensor in Z dimension +- * (in bytes) +- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed +- * per workitem(in bytes) +- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source +- * tensor +- * @param[out] out_ptr Pointer to the destination tensor. +- * Supported data types: QASYMM8 +- * @param[in] out_stride_x Stride of the destination tensor in X dimension +- * (in bytes) +- * @param[in] out_step_x out_stride_x * number of elements along X processed +- * per workitem(in bytes) +- * @param[in] out_stride_y Stride of the destination tensor in Y dimension +- * (in bytes) +- * @param[in] out_step_y out_stride_y * number of elements along Y processed +- * per workitem(in bytes) +- * @param[in] out_stride_z Stride of the source tensor in Z dimension +- * (in bytes) +- * @param[in] out_step_z out_stride_z * number of elements along Z processed +- * per workitem(in bytes) +- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination +- * tensor +- */ +-__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), +- TENSOR3D_DECLARATION(out)) +-{ +- // Get pixels pointer +- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); +- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); +- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); +- +- // Load data +- VEC_DATA_TYPE(int, 16) +- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); +- VEC_DATA_TYPE(int, 16) +- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); +- +- // Get scaled value of two inputs +- VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); +- VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); +- +- VEC_DATA_TYPE(int, 16) +- left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); +- VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; +- VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; +- +- VEC_DATA_TYPE(int, 16) +- scaled_in1_val = +- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); +- VEC_DATA_TYPE(int, 16) +- scaled_in2_val = +- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); +- +- // Add inputs and multiply with a multiplier smaller than 1 +- VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; +- VEC_DATA_TYPE(int, 16) +- out_val = +- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); +- out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); +- +- VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); +- +- // TODO: Apply min-max BOUND to support fuse with relu. 
+- /* +- #if defined(MIN_BOUND) +- res = max(res, (uchar16)MIN_BOUND); +- #endif // defined(MIN_BOUND) +- #if defined(MAX_BOUND) +- res = min(res, (uchar16)MAX_BOUND); +- #endif // defined(MAX_BOUND) +- */ +- +- // Store result +- VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +deleted file mode 100644 +index 4147a00..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl ++++ /dev/null +@@ -1,233 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +- +-#ifndef SCALE +-#define SCALE 1.0f +-#endif +-#ifndef OFFSET +-#define OFFSET 0 +-#endif +-#ifndef VEC_SIZE +-#define VEC_SIZE 1 +-#endif +- +-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) +-/** Perform a cast operation on an input tensor. +- * +- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and +- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * @attention -DBOOL_INPUT : Whether type of input is bool. +- * +- * @param[in] input_ptr Pointer to the source image. 
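The arithmetic_add_qasymm8 kernel removed above performs the documented steps (offset, scale, add, rescale, clamp) entirely in integer arithmetic with fixed-point multipliers and shifts. A float reference of the same QASYMM8 addition is easier to follow and should agree with the integer pipeline up to rounding; the kernel folds the sign of the zero points into its OFFSET constants, so the parameter names below are illustrative rather than a transcription of the kernel:

#include <math.h>
#include <stdio.h>

/* Float reference for QASYMM8 addition: dequantize both inputs with their own
 * (scale, zero_point), add in real space, then requantize with the output
 * parameters and clamp to the QASYMM8 range [0, 255]. */
static unsigned char qadd_ref(unsigned char a, float sa, int za,
                              unsigned char b, float sb, int zb,
                              float so, int zo)
{
  float real = sa * (float)(a - za) + sb * (float)(b - zb);
  float q = rintf(real / so) + (float)zo;
  if (q < 0.f)   q = 0.f;
  if (q > 255.f) q = 255.f;
  return (unsigned char)q;
}

int main(void)
{
  /* a = 0.5 and b = 0.3 with scale 0.01, zero point 10; output scale 0.02 */
  unsigned char c = qadd_ref(60, 0.01f, 10, 40, 0.01f, 10, 0.02f, 0);
  printf("%u\n", c); /* (0.5 + 0.3) / 0.02 = 40 */
  return 0;
}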
Supported data +- * types: F16/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +-{ +- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +- +- VSTORE(VEC_SIZE) +- (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), +- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), +- 0, (__global DATA_TYPE_OUT *)output.ptr); +- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) +- res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), +- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); +-#if defined(BOOL_INPUT) +- VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE)); +- VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1); +- res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); +-#endif // defined(BOOL_INPUT) +- +- VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr); +-} +- +-/** Perform a cast operation on an QASYMM8 input tensor. +- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and +- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int +- * @attention Offset and Scale of input should be given as a preprocessor argument using +- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * +- * @param[in] input_ptr Pointer to the source image. 
Supported data +- * types: F16/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +-{ +- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +- +- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) +- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); +- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); +- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); +- +- VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; +- VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; +- +- VSTORE(VEC_SIZE) +- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, +- (__global DATA_TYPE_OUT *)output.ptr); +-} +- +-/** Perform a cast operation on an QASYMM8 output tensor. +- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and +- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int +- * @attention Offset and Scale of output should be given as a preprocessor argument using +- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * +- * @param[in] input_ptr Pointer to the source image. 
Supported data +- * types: F16/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. Supported data +- * types: U8 +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +-{ +- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +- +- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) +- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); +- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); +- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); +- +- VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; +- VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); +- +- VSTORE(VEC_SIZE) +- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, +- (__global DATA_TYPE_OUT *)output.ptr); +-} +-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl +deleted file mode 100644 +index 0285c95..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl ++++ /dev/null +@@ -1,185 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016, 2017 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +- +-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +-/** Perform space to depth rearrangement of tensor +- * +- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float +- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. +- * e.g. -DDEPTH_OUT=16 +- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument +- * using -DZ_OUT=size. e.g. -DZ_OUT=16 +- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. +- * -DBLOCK_SIZE=1 +- * +- * @param[in] input_ptr Pointer to the source image. Supported data +- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. 
Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_stride_w Stride of the source tensor in W dimension (in +- * bytes) +- * @param[in] output_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +-{ +- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); +- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); +- +- int out_index[4] = {0}; +- int in_index[4] = {0}; +- +- out_index[0] = get_global_id(0); // W +- out_index[1] = get_global_id(1); // H +- out_index[2] = get_global_id(2) % Z_OUT; // C +- out_index[3] = get_global_id(2) / Z_OUT; // B +- +- in_index[0] = out_index[0] / BLOCK_SIZE; +- in_index[1] = out_index[1] / BLOCK_SIZE; +- in_index[2] = out_index[2] + +- ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; +- in_index[3] = out_index[3]; +- +- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( +- &in, in_index[0], in_index[1], in_index[2], in_index[3])); +-} +-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +- +-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +-/** Perform space to depth rearrangement of tensor (NHWC) +- * +- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float +- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. +- * e.g. -DDEPTH_OUT=16 +- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument +- * using -DZ_OUT=size. e.g. -DZ_OUT=16 +- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. +- * -DBLOCK_SIZE=1 +- * +- * @param[in] input_ptr Pointer to the source image. Supported data +- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. 
Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_stride_w Stride of the source tensor in W dimension (in +- * bytes) +- * @param[in] output_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +-{ +- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); +- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); +- +- int out_index[4] = {0}; +- int in_index[4] = {0}; +- +- out_index[0] = get_global_id(0); // C +- out_index[1] = get_global_id(1); // W +- out_index[2] = get_global_id(2) % Z_OUT; // H +- out_index[3] = get_global_id(2) / Z_OUT; // B +- +- in_index[0] = out_index[0] + +- ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; +- in_index[1] = out_index[1] / BLOCK_SIZE; +- in_index[2] = out_index[2] / BLOCK_SIZE; +- in_index[3] = out_index[3]; +- +- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( +- &in, in_index[0], in_index[1], in_index[2], in_index[3])); +-} +-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +index 2d0b6a2..e07a25e 100644 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h ++++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +@@ -15,7 +15,7 @@ + */ + + /* +- * Copyright (c) 2016-2018 ARM Limited. ++ * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,7 +37,6 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +- + #ifndef ARM_COMPUTE_HELPER_H + #define ARM_COMPUTE_HELPER_H + +@@ -59,16 +58,219 @@ + #pragma OPENCL EXTENSION cl_arm_printf : enable + #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) + ++#define GPU_ARCH_MIDGARD 0x100 ++#define GPU_ARCH_BIFROST 0x200 ++ ++/** Concatenate two inputs. ++ * ++ * @param[in] a The first input to be concatenated ++ * @param[in] b The second input to be concatenated ++ * ++ * @return The concatenated output ++ */ ++#define CONCAT(a, b) a##b ++ ++/** Expand the given vector ++ * ++ * @param[in] x The vector to be expanded ++ * ++ * @return The expanded output ++ */ + #define EXPAND(x) x + ++/** Clamp the given value between an upper and lower bound. ++ * ++ * @param[in] x The value to be clamped ++ * @param[in] min_val The lower bound ++ * @param[in] max_val The upper bound ++ * ++ * @return The clamped value. ++ */ + #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + ++/** REVn reverses the given vector whose size is n. 
++ * @name REVn ++ * ++ * @param[in] x The vector to be reversed ++ * ++ * @return The reversed vector ++ * @{ ++ */ ++#define REV1(x) ((x)) ++#define REV2(x) ((x).s10) ++#define REV3(x) ((x).s210) ++#define REV4(x) ((x).s3210) ++#define REV8(x) ((x).s76543210) ++#define REV16(x) ((x).sFEDCBA9876543210) ++/** @} */ // end of group REVn ++ ++/** Reverse the given vector. ++ * @name REVERSE ++ * ++ * @param[in] x The vector to be reversed ++ * @param[in] s The size of the vector ++ * ++ * @return The reversed vector ++ * @{ ++ */ ++#define REVERSE_STR(x, s) REV##s((x)) ++#define REVERSE(x, s) REVERSE_STR(x, s) ++/** @} */ // end of group REVERSE ++ ++/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. ++ * @name ROTs_n ++ * ++ * @param[in] x The vector to be shifted ++ * ++ * @return The shifted vector ++ * @{ ++ */ ++#define ROT1_0(x) ((x)) ++ ++#define ROT2_0(x) ((x)) ++#define ROT2_1(x) ((x).s10) ++ ++#define ROT3_0(x) ((x)) ++#define ROT3_1(x) ((x).s201) ++#define ROT3_2(x) ((x).s120) ++ ++#define ROT4_0(x) ((x)) ++#define ROT4_1(x) ((x).s3012) ++#define ROT4_2(x) ((x).s2301) ++#define ROT4_3(x) ((x).s1230) ++ ++#define ROT8_0(x) ((x)) ++#define ROT8_1(x) ((x).s70123456) ++#define ROT8_2(x) ((x).s67012345) ++#define ROT8_3(x) ((x).s56701234) ++#define ROT8_4(x) ((x).s45670123) ++#define ROT8_5(x) ((x).s34567012) ++#define ROT8_6(x) ((x).s23456701) ++#define ROT8_7(x) ((x).s12345670) ++ ++#define ROT16_0(x) ((x)) ++#define ROT16_1(x) ((x).sF0123456789ABCDE) ++#define ROT16_2(x) ((x).sEF0123456789ABCD) ++#define ROT16_3(x) ((x).sDEF0123456789ABC) ++#define ROT16_4(x) ((x).sCDEF0123456789AB) ++#define ROT16_5(x) ((x).sBCDEF0123456789A) ++#define ROT16_6(x) ((x).sABCDEF0123456789) ++#define ROT16_7(x) ((x).s9ABCDEF012345678) ++#define ROT16_8(x) ((x).s89ABCDEF01234567) ++#define ROT16_9(x) ((x).s789ABCDEF0123456) ++#define ROT16_10(x) ((x).s6789ABCDEF012345) ++#define ROT16_11(x) ((x).s56789ABCDEF01234) ++#define ROT16_12(x) ((x).s456789ABCDEF0123) ++#define ROT16_13(x) ((x).s3456789ABCDEF012) ++#define ROT16_14(x) ((x).s23456789ABCDEF01) ++#define ROT16_15(x) ((x).s123456789ABCDEF0) ++/** @} */ // end of group ROTs_n ++ ++/** Circular-right-shift (rotate-right) the given vector by the given amount. ++ * @name ROTATE ++ * ++ * @param[in] x The vector to be shifted ++ * @param[in] s The size of the vector ++ * @param[in] n The amount to be shifted ++ * ++ * @return The shifted vector ++ * @{ ++ */ ++#define ROTATE_STR(x, s, n) ROT##s##_##n(x) ++#define ROTATE(x, s, n) ROTATE_STR(x, s, n) ++/** @} */ // end of group ROTATE ++ ++/** Creates a vector of size n filled with offset values corresponding to the location of each ++ * element. ++ * @name V_OFFSn ++ * ++ * @param[in] dt The data type of the output vector ++ * ++ * @return The vector filled with offset values ++ * @{ ++ */ ++#define V_OFFS1(dt) (dt)(0) ++#define V_OFFS2(dt) (dt)(0, 1) ++#define V_OFFS3(dt) (dt)(0, 1, 3) ++#define V_OFFS4(dt) (dt)(0, 1, 2, 3) ++#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7) ++#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) ++/** @} */ // end of group V_OFFSn ++ ++/** Create a vector filled with offset values corresponding to the location of each element. 
++ * @name VEC_OFFS ++ * ++ * @param[in] dt The data type of the output vector ++ * @param[in] s The size of the output vector ++ * ++ * @return The vector filled with offset values ++ * @{ ++ */ ++#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) ++#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) ++/** @} */ // end of group VEC_OFFS ++ + #define VLOAD_STR(size) vload##size + #define VLOAD(size) VLOAD_STR(size) + + #define VSTORE_STR(size) vstore##size + #define VSTORE(size) VSTORE_STR(size) + ++#define float1 float ++#define half1 half ++#define char1 char ++#define uchar1 uchar ++#define short1 short ++#define ushort1 ushort ++#define int1 int ++#define uint1 uint ++#define long1 long ++#define ulong1 ulong ++#define double1 double ++ ++#define vload1(OFFSET, PTR) *(OFFSET + PTR) ++#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA ++ ++// Convert built-in functions with _sat modifier are not supported in floating point so we create ++// defines ++// without _sat to overcome this issue ++#define convert_float_sat convert_float ++#define convert_float1_sat convert_float ++#define convert_float2_sat convert_float2 ++#define convert_float3_sat convert_float3 ++#define convert_float4_sat convert_float4 ++#define convert_float8_sat convert_float8 ++#define convert_float16_sat convert_float16 ++#define convert_half_sat convert_float ++#define convert_half1_sat convert_half ++#define convert_half2_sat convert_half2 ++#define convert_half3_sat convert_half3 ++#define convert_half4_sat convert_half4 ++#define convert_half8_sat convert_half8 ++#define convert_half16_sat convert_half16 ++ ++#define convert_float1 convert_float ++#define convert_half1 convert_half ++#define convert_char1 convert_char ++#define convert_uchar1 convert_uchar ++#define convert_short1 convert_short ++#define convert_ushort1 convert_ushort ++#define convert_int1 convert_int ++#define convert_uint1 convert_uint ++#define convert_long1 convert_long ++#define convert_ulong1 convert_ulong ++#define convert_double1 convert_double ++ ++#define convert_char1_sat convert_char_sat ++#define convert_uchar1_sat convert_uchar_sat ++#define convert_short1_sat convert_short_sat ++#define convert_ushort1_sat convert_ushort_sat ++#define convert_int1_sat convert_int_sat ++#define convert_uint1_sat convert_uint_sat ++#define convert_long1_sat convert_long_sat ++#define convert_ulong1_sat convert_ulong_sat ++#define convert_double1_sat convert_double_sat ++ + #define VEC_DATA_TYPE_STR(type, size) type##size + #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +index a83b1a8..5f1b3f9 100644 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h ++++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +@@ -15,7 +15,7 @@ + */ + + /* +- * Copyright (c) 2017-2018 ARM Limited. ++ * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,29 +37,112 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
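The vector utilities added to helpers.h above (REVERSE, ROTATE, VEC_OFFS, VLOAD, VSTORE, VEC_DATA_TYPE) all follow the same pattern: an inner ..._STR macro that does the token pasting and an outer wrapper that forces its arguments to be macro-expanded first, so a size passed as another macro still resolves to a number before pasting. A minimal C illustration of why the second level is needed (macro names here are illustrative, not the ones from the header):

#include <stdio.h>

#define SIZE 4

/* Single-level pasting: SIZE is glued literally, producing the token rev_SIZE. */
#define PASTE_BAD(name, s) name##s

/* Two-level pasting, as in helpers.h: the inner _STR macro sees the already
 * expanded argument, so PASTE_OK(rev_, SIZE) becomes rev_4. */
#define PASTE_STR(name, s) name##s
#define PASTE_OK(name, s) PASTE_STR(name, s)

static int rev_4 = 42;

int main(void)
{
  /* int x = PASTE_BAD(rev_, SIZE);  would not compile: 'rev_SIZE' undeclared */
  int x = PASTE_OK(rev_, SIZE);
  printf("%d\n", x); /* 42 */
  return 0;
}

The size-1 aliases (float1, vload1, convert_float1_sat and friends) added above exist for the same reason: they let the generated names remain valid when VEC_SIZE expands to 1, where OpenCL has no built-in vector type or vload/vstore variant.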
+ */ +- + #ifndef ARM_COMPUTE_HELPERS_ASYMM_H + #define ARM_COMPUTE_HELPERS_ASYMM_H + + #include "helpers.h" + ++/** Convert the given vector with round to nearest even rounding mode ++ * ++ * @param[in] x The target to be converted ++ * @param[in] type The target type ++ * ++ * @return The converted vector ++ */ ++#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) ++#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) ++ ++/** Quantize a floating-point scalar value to 8-bit asymmetric ++ * ++ * @param[in] input Input value to quantize ++ * @param[in] offset Quantization offset ++ * @param[in] scale Quantization scale ++ * ++ * @return quantized value ++ */ ++inline uchar quantize_qasymm8(float input, float offset, float scale) ++{ ++ float out_f32 = input / scale + offset; ++ uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); ++ return res_u8; ++} ++ ++/** Dequantize a scalar value from 8-bit asymmetric to floating-point ++ * ++ * @param[in] input Input value to quantize ++ * @param[in] offset Quantization offset ++ * @param[in] scale Quantization scale ++ * ++ * @return quantized value ++ */ ++inline float dequantize_qasymm8(uchar input, float offset, float scale) ++{ ++ return ((float)input - offset) * scale; ++} ++ ++/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point ++ * ++ * @param[in] input Input value to quantize ++ * @param[in] offset Quantization offset ++ * @param[in] scale Quantization scale ++ * ++ * @return quantized value ++ */ ++inline float dequantize_qasymm8_signed(char input, float offset, float scale) ++{ ++ return ((float)input - offset) * scale; ++} ++ ++/** Quantize a vector of values from floating-point ++ * ++ * @param[in] type Output data type. ++ * @param[in] size Size of vector. ++ * ++ * @return quantized values ++ */ ++#define QUANTIZE_IMPL(type, size) \ ++ inline VEC_DATA_TYPE(type, size) \ ++ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ ++ { \ ++ VEC_DATA_TYPE(float, size) \ ++ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ ++ VEC_DATA_TYPE(type, size) \ ++ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \ ++ VEC_DATA_TYPE(type, size)); \ ++ return res; \ ++ } ++ ++/** Dequantize a vector of values to floating-point ++ * ++ * @param[in] type Input data type. ++ * @param[in] size Size of vector. ++ * ++ * @return dequantized values in floating point ++ */ ++#define DEQUANTIZE_IMPL(type, size) \ ++ inline VEC_DATA_TYPE(float, size) \ ++ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ ++ { \ ++ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ ++ } ++ + /** Correctly-rounded-to-nearest division by a power-of-two. + * + * @param[in] size Size of vector. + * + * @return Correctly-rounded-to-nearest division by a power-of-two. 
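The quantize/dequantize helpers added to helpers_asymm.h above pair an affine mapping with a round-to-nearest-even convert (CONVERT_DOWN_RTE) and a saturating cast (CONVERT_SAT). A scalar C sketch of that behaviour, assuming the default floating-point rounding mode so that rintf matches the _rte convert:

#include <math.h>
#include <stdio.h>

/* Quantize: scale into the quantized domain, round to nearest even (like
 * CONVERT_DOWN_RTE), then saturate to [0, 255] (like CONVERT_SAT to uchar). */
static unsigned char quantize_qasymm8_ref(float x, float offset, float scale)
{
  float q = rintf(x / scale + offset);
  if (q < 0.f)   q = 0.f;
  if (q > 255.f) q = 255.f;
  return (unsigned char)q;
}

static float dequantize_qasymm8_ref(unsigned char q, float offset, float scale)
{
  return ((float)q - offset) * scale;
}

int main(void)
{
  /* 0.625 with scale 0.25 and offset 128: 0.625/0.25 + 128 = 130.5, which
   * rounds to the even neighbour 130 (round-half-away-from-zero would give 131). */
  unsigned char q = quantize_qasymm8_ref(0.625f, 128.f, 0.25f);
  printf("q=%u back=%f\n", q, dequantize_qasymm8_ref(q, 128.f, 0.25f)); /* q=130 back=0.500000 */
  return 0;
}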
+ */ +-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ +- inline VEC_DATA_TYPE(int, size) \ +- asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \ +- { \ +- VEC_DATA_TYPE(int, size) \ +- mask = (1 << exponent) - 1; \ +- const VEC_DATA_TYPE(int, size) zero = 0; \ +- const VEC_DATA_TYPE(int, size) one = 1; \ +- VEC_DATA_TYPE(int, size) \ +- threshold = (mask >> 1) + select(zero, one, x < 0); \ +- return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ ++#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ ++ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \ ++ VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ ++ { \ ++ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ ++ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ ++ VEC_DATA_TYPE(int, size) \ ++ mask = (one << exponent) - one; \ ++ VEC_DATA_TYPE(int, size) \ ++ threshold = (mask >> 1) + select(zero, one, x < 0); \ ++ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ + } + + /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), +@@ -81,9 +164,19 @@ + b_64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + ab_64 = a_64 * b_64; \ +- /* COMPMID-907 */ \ ++ /* Revert COMPMID-907 */ \ ++ VEC_DATA_TYPE(long, size) \ ++ mask1 = 1 << 30; \ ++ VEC_DATA_TYPE(long, size) \ ++ mask2 = 1 - (1 << 30); \ ++ VEC_DATA_TYPE(long, size) \ ++ is_positive_or_zero = ab_64 >= 0; \ ++ VEC_DATA_TYPE(long, size) \ ++ nudge = select(mask2, mask1, is_positive_or_zero); \ ++ VEC_DATA_TYPE(long, size) \ ++ mask = 1ll << 31; \ + VEC_DATA_TYPE(int, size) \ +- ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \ ++ ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ + return select(ab_x2_high32, INT_MAX, overflow); \ + } + +@@ -335,9 +428,18 @@ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ + } + ++#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) ++#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) ++#define DEQUANTIZE_STR(input, offset, scale, type, size) \ ++ dequantize_##type##size(input, offset, scale) ++#define DEQUANTIZE(input, offset, scale, type, size) \ ++ DEQUANTIZE_STR(input, offset, scale, type, size) ++ + #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \ + asymm_rounding_divide_by_POW2_##size(x, exponent) + #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) ++#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ ++ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) + #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) + #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ +@@ -360,11 +462,53 @@ + #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ + asymm_rescale##size(value, src_integer_bits, dst_integer_bits) + ++#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ ++ inline VEC_DATA_TYPE(int, size) \ ++ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ ++ { \ ++ const int left_shift = shift > 0 ? shift : 0; \ ++ const int right_shift = shift > 0 ? 
0 : -shift; \ ++ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ ++ right_shift, size); \ ++ } ++#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ ++ multiply_by_quantized_multiplier##size(input, qmul, shift) ++ ++QUANTIZE_IMPL(uchar, 1) ++QUANTIZE_IMPL(char, 1) ++QUANTIZE_IMPL(uint, 1) ++QUANTIZE_IMPL(int, 1) ++QUANTIZE_IMPL(uchar, 4) ++QUANTIZE_IMPL(ushort, 4) ++QUANTIZE_IMPL(short, 4) ++QUANTIZE_IMPL(uchar, 16) ++QUANTIZE_IMPL(char, 16) ++QUANTIZE_IMPL(ushort, 16) ++QUANTIZE_IMPL(short, 16) ++QUANTIZE_IMPL(uint, 16) ++QUANTIZE_IMPL(int, 16) ++ ++DEQUANTIZE_IMPL(uchar, 1) ++DEQUANTIZE_IMPL(char, 1) ++DEQUANTIZE_IMPL(uint, 1) ++DEQUANTIZE_IMPL(int, 1) ++DEQUANTIZE_IMPL(uchar, 4) ++DEQUANTIZE_IMPL(ushort, 4) ++DEQUANTIZE_IMPL(short, 4) ++DEQUANTIZE_IMPL(uchar, 16) ++DEQUANTIZE_IMPL(char, 16) ++DEQUANTIZE_IMPL(ushort, 16) ++DEQUANTIZE_IMPL(short, 16) ++DEQUANTIZE_IMPL(uint, 16) ++DEQUANTIZE_IMPL(int, 16) ++ ++ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) + ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) + ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) + ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) + ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) + ++ASYMM_MULT_IMPL(1) + ASYMM_MULT_IMPL(2) + ASYMM_MULT_IMPL(4) + ASYMM_MULT_IMPL(8) +@@ -375,16 +519,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) + ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) + ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) + ++ASYMM_SELECT_USING_MASK_IMPL(1) + ASYMM_SELECT_USING_MASK_IMPL(2) + ASYMM_SELECT_USING_MASK_IMPL(4) + ASYMM_SELECT_USING_MASK_IMPL(8) + ASYMM_SELECT_USING_MASK_IMPL(16) + ++ASYMM_MASK_IF_ZERO_IMPL(1) + ASYMM_MASK_IF_ZERO_IMPL(2) + ASYMM_MASK_IF_ZERO_IMPL(4) + ASYMM_MASK_IF_ZERO_IMPL(8) + ASYMM_MASK_IF_ZERO_IMPL(16) + ++ASYMM_MASK_IF_NON_ZERO_IMPL(1) + ASYMM_MASK_IF_NON_ZERO_IMPL(2) + ASYMM_MASK_IF_NON_ZERO_IMPL(4) + ASYMM_MASK_IF_NON_ZERO_IMPL(8) +@@ -400,6 +547,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) + ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) + ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) + ++ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) +@@ -415,9 +563,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) + ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) + ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) + ++ASYMM_RESCALE_IMPL(1) + ASYMM_RESCALE_IMPL(2) + ASYMM_RESCALE_IMPL(4) + ASYMM_RESCALE_IMPL(8) + ASYMM_RESCALE_IMPL(16) + ++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) ++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) ++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) ++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) ++MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) ++ + #endif // ARM_COMPUTE_HELPERS_ASYMM_H +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl +deleted file mode 100644 +index 12c8eeb..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl ++++ /dev/null +@@ -1,120 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
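Taken together, the reworked ASYMM_MULT (with the reverted COMPMID-907 nudge), the generalised asymm_rounding_divide_by_POW2 and the new multiply_by_quantized_multiplier implement the usual fixed-point requantization: a saturating doubling high multiply by a Q31 multiplier followed by a rounding right shift. A scalar int32/int64 sketch of that arithmetic (not the vectorised OpenCL code):

#include <stdint.h>
#include <stdio.h>

/* Saturating doubling high multiply: (a * b * 2) >> 32 with a rounding nudge,
 * the scalar counterpart of the rewritten ASYMM_MULT macro. */
static int32_t sat_doubling_high_mul(int32_t a, int32_t b)
{
  if (a == INT32_MIN && b == INT32_MIN)
    return INT32_MAX;                       /* the only overflowing case */
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1ll << 30) : 1 - (1ll << 30);
  return (int32_t)((ab + nudge) / (1ll << 31));
}

/* Rounding (to nearest, ties away from zero) arithmetic right shift, the
 * scalar counterpart of asymm_rounding_divide_by_POW2. */
static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
  int32_t mask = (1 << exponent) - 1;
  int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
}

/* multiply_by_quantized_multiplier: rescale x by qmul * 2^shift. */
static int32_t mul_by_qmult(int32_t x, int32_t qmul, int shift)
{
  int left = shift > 0 ? shift : 0, right = shift > 0 ? 0 : -shift;
  return rounding_divide_by_pow2(sat_doubling_high_mul(x << left, qmul), right);
}

int main(void)
{
  /* qmul = 0.5 in Q31 (1 << 30), shift = 0: result is x / 2, rounded. */
  printf("%d %d\n", mul_by_qmult(10, 1 << 30, 0), mul_by_qmult(7, 1 << 30, 0));
  return 0; /* prints 5 4 */
}

The new size-1 instantiations (ASYMM_*_IMPL(1), QUANTIZE_IMPL(type, 1), and so on) simply make the same machinery available to scalar code paths.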
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +- +-#ifndef VEC_SIZE +-#define VEC_SIZE 1 +-#endif +- +-#if defined(DATA_TYPE) +-/** Returns result of prelu function implemented as below: +- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. +- * +- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * @note Can only take floating point data types. +- * +- * @param[in] input1_ptr Pointer to the source image. Supported Data +- * types : F16/F32 +- * @param[in] input1_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input1_step_x input1_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input1_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input1_step_y input1_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input1_step_z input1_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[in] alpha_ptr Pointer to the source image. 
Supported Data +- * types : F16/F32 +- * @param[in] alpha_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] alpha_step_x input2_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] alpha_step_y input2_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] alpha_step_z input2_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * +- * @param[out] output_ptr Pointer to the destination image. Supported +- * data types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), +- TENSOR3D_DECLARATION(output)) +-{ +- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); +- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +- +- VSTORE(VEC_SIZE) +- (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 +- ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * +- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) +- : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), +- 0, (__global DATA_TYPE *)output.ptr); +-} +-#endif // defined(DATA_TYPE) +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl +deleted file mode 100644 +index a66e107..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl ++++ /dev/null +@@ -1,138 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +-#define SUB(x, y) (x) - (y) +- +-#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \ +- defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) +- +-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) +-#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +-#define SELECT_TYPE VEC_INT +- +-/** Returns result of prelu function implemented as below: +- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. +- * +- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. +- * -DDATA_TYPE_IN=uchar +- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. +- * -DVEC_SIZE=16 +- * @note Can only take uchar data types. +- * +- * @param[in] input1_ptr Pointer to the source image. Supported Data +- * types : QASYMM8 +- * @param[in] input1_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input1_step_x input1_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input1_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input1_step_y input1_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input1_step_z input1_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[in] alpha_ptr Pointer to the source image. 
Supported Data +- * types : QASYMM8 +- * @param[in] alpha_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] alpha_step_x input2_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] alpha_step_y input2_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] alpha_step_z input2_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. Supported +- * data types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), +- TENSOR3D_DECLARATION(output)) +-{ +- // Get pixels pointer +- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); +- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +- +- VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); +- VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); +- +- in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN)); +- alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA)); +- +- const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN); +- const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA); +- const VEC_FLOAT outf32 = +- select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE)); +- const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); +- const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); +- +- VSTORE(VEC_SIZE) +- (res, 0, (__global uchar *)output.ptr); +-} +- +-#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && +- // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) +diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl +deleted file mode 100644 +index eb612f8..0000000 +--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl ++++ /dev/null +@@ -1,185 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
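The prelu_qasymm8 kernel removed above dequantizes both operands with their offset/scale, applies the same PReLU rule in float, then requantizes with the output scale and offset. A scalar sketch of that arithmetic, assuming QASYMM8 (uint8, asymmetric) parameters; the parameter names are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Dequantize -> PReLU -> requantize, as performed per element by the
    // deleted prelu_qasymm8 kernel.
    uint8_t prelu_qasymm8_reference(uint8_t in, uint8_t alpha,
                                    int off_in, float scale_in,
                                    int off_alpha, float scale_alpha,
                                    int off_out, float scale_out)
    {
      const float in_f32    = (static_cast<int>(in) - off_in) * scale_in;
      const float alpha_f32 = (static_cast<int>(alpha) - off_alpha) * scale_alpha;
      const float out_f32   = (in_f32 < 0.0f) ? in_f32 * alpha_f32 : in_f32;

      // Round to nearest and saturate to the uint8 range, mirroring
      // CONVERT_RTE / CONVERT_SAT in the kernel.
      const int q = static_cast<int>(std::nearbyint(out_f32 / scale_out)) + off_out;
      return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }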
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016, 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "helpers.h" +- +-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +-/** Perform space to depth rearrangement of tensor +- * +- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float +- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. +- * e.g. -DDEPTH_IN=16 +- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor +- * argument using -DZ_IN=size. e.g. -DZ_IN=16 +- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. +- * -DBLOCK_SIZE=1 +- * +- * @param[in] input_ptr Pointer to the source image. Supported data +- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. 
Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_stride_w Stride of the source tensor in W dimension (in +- * bytes) +- * @param[in] output_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +-{ +- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); +- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +- +- int out_index[4] = {0}; +- int in_index[4] = {0}; +- +- in_index[0] = get_global_id(0); // W +- in_index[1] = get_global_id(1); // H +- in_index[2] = get_global_id(2) % Z_IN; // C +- in_index[3] = get_global_id(2) / Z_IN; // B +- +- out_index[0] = in_index[0] / BLOCK_SIZE; +- out_index[1] = in_index[1] / BLOCK_SIZE; +- out_index[2] = +- in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; +- out_index[3] = in_index[3]; +- +- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], +- out_index[3])) = *((__global DATA_TYPE *)in.ptr); +-} +-#endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +- +-#if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +-/** Perform space to depth rearrangement of tensor +- * +- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float +- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. +- * e.g. -DDEPTH_IN=16 +- * @attention The value of the z-axis of input tensor depth should be given as a preprocessor +- * argument using -DZ_IN=size. e.g. -DZ_IN=16 +- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. +- * -DBLOCK_SIZE=1 +- * +- * @param[in] input_ptr Pointer to the source image. Supported data +- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 +- * @param[in] input_stride_x Stride of the source image in X dimension (in +- * bytes) +- * @param[in] input_step_x input_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] input_stride_y Stride of the source image in Y dimension (in +- * bytes) +- * @param[in] input_step_y input_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] input_step_z input_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source +- * image +- * @param[out] output_ptr Pointer to the destination image. 
Supported data +- * types: same as @p input_ptr +- * @param[in] output_stride_x Stride of the destination image in X dimension +- * (in bytes) +- * @param[in] output_step_x output_stride_x * number of elements along X +- * processed per workitem(in bytes) +- * @param[in] output_stride_y Stride of the destination image in Y dimension +- * (in bytes) +- * @param[in] output_step_y output_stride_y * number of elements along Y +- * processed per workitem(in bytes) +- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in +- * bytes) +- * @param[in] output_step_z output_stride_z * number of elements along Z +- * processed per workitem(in bytes) +- * @param[in] output_stride_w Stride of the source tensor in W dimension (in +- * bytes) +- * @param[in] output_step_w output_stride_w * number of elements along W +- * processed per workitem(in bytes) +- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the +- * destination image +- */ +-__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +-{ +- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); +- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +- +- int out_index[4] = {0}; +- int in_index[4] = {0}; +- +- in_index[0] = get_global_id(0); // C +- in_index[1] = get_global_id(1); // W +- in_index[2] = get_global_id(2) % Z_IN; // H +- in_index[3] = get_global_id(2) / Z_IN; // B +- +- out_index[0] = +- in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN; +- out_index[1] = in_index[1] / BLOCK_SIZE; +- out_index[2] = in_index[2] / BLOCK_SIZE; +- out_index[3] = in_index[3]; +- +- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], +- out_index[3])) = *((__global DATA_TYPE *)in.ptr); +-} +-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp +deleted file mode 100644 +index 06eeb5b..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp ++++ /dev/null +@@ -1,181 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. 
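The space_to_depth_nhwc kernel removed above only remaps coordinates: the block offset of each spatial position is folded into the channel index. A host-side sketch of that index mapping, with block_size and depth_in standing in for the -DBLOCK_SIZE and -DDEPTH_IN build options:

    #include <array>

    // Where an NHWC input element at (c, w, h, b) lands in the output of the
    // deleted space_to_depth_nhwc kernel.
    std::array<int, 4> space_to_depth_nhwc_index(int c, int w, int h, int b,
                                                 int block_size, int depth_in)
    {
      std::array<int, 4> out{};
      out[0] = c + ((h % block_size) * block_size + w % block_size) * depth_in; // C
      out[1] = w / block_size;                                                  // W
      out[2] = h / block_size;                                                  // H
      out[3] = b;                                                               // B
      return out;
    }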
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-namespace +-{ +-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) +-{ +- TensorShape out_shape{input_shape}; +- +- out_shape.set(axis, 1); +- +- return out_shape; +-} +-} // namespace +- +-namespace +-{ +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, +- ArgOperation /*op*/) +-{ +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8, +- DataType::QASYMM8); +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) != +- output->tensor_shape().num_dimensions(), +- "Input's rank is not same with output"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, +- "Inputs are not broadcast compatible"); +- +- const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), +- "output shape's size does not match axis"); +- +- const auto num_dimensions = input->tensor_shape().num_dimensions(); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank)."); +- return Status{}; +-} +- +-} // namespace +- +-CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} +- +-void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, +- ArgOperation op) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); +- +- _input = input; +- _output = output; +- _axis = axis; +- +- std::unique_ptr output_info = output->info()->clone(); +- output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); +- +- // Construct kernel and set op_code based on type of ArgOperation as specified by object op +- std::string kernel_name = "arg_op"; +- int op_code = 0; +- if (op == ArgOperation::MAX) +- { +- op_code = 1; +- } +- else if (op == ArgOperation::MIN) +- { +- 
op_code = 2; +- } +- else +- throw std::runtime_error("Operation not supported, yet"); +- +- // Set kernel build options +- std::set build_opts; +- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); +- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); +- build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code)); +- +- // Create kernel +- _kernel = +- static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); +- +- // Configure kernel window +- Window win = calculate_max_window(*output_info, Steps()); +- +- Coordinates coord; +- coord.set_num_dimensions(output_info->num_dimensions()); +- output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); +- +- ICLKernel::configure_internal(win); +-} +- +-Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, +- const uint32_t axis, ArgOperation op) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); +- +- return Status{}; +-} +- +-void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); +- +- const TensorShape &shape_in = _input->info()->tensor_shape(); +- +- unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters +- +- _kernel.setArg(idx++, _axis); +- _kernel.setArg(idx++, shape_in[_axis]); +- +- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); +- +- // Setup input slice +- Window slice_in(slice_out); +- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); +- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); +- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); +- slice_in.set(3, Window::Dimension(0, 0, 0)); +- +- // Copy output's shape in order to use for recovering at end of this method +- const TensorShape shape_out = _output->info()->tensor_shape(); +- _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); +- +- do +- { +- unsigned int idx = 0; +- add_4D_tensor_argument(idx, _input, slice_in); +- add_4D_tensor_argument(idx, _output, slice_out); +- enqueue(queue, *this, slice_out); +- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +- +- // Recover output's shape of output tensor +- _output->info()->set_tensor_shape(shape_out); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +index bb55568..fbc76f5 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +@@ -43,6 +43,7 @@ + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibraryEx.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp +deleted file mode 100644 +index 01ea655..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp ++++ /dev/null +@@ -1,132 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
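The CLArgOperationKernel removed above infers the output shape by collapsing the reduced axis to 1 and encodes the operation as the -DOP_CODE build option (MAX -> 1, MIN -> 2, anything else rejected). A sketch of both rules; the ArgOperation enum and the std::vector shape are stand-ins for the arm_compute types:

    #include <stdexcept>
    #include <vector>

    enum class ArgOperation { MAX, MIN };

    // The reduced axis collapses to a single element in the inferred output shape.
    std::vector<int> infer_arg_output_shape(std::vector<int> shape, unsigned int axis)
    {
      shape[axis] = 1;
      return shape;
    }

    // ArgOperation -> -DOP_CODE value used when building the "arg_op" kernel.
    int arg_operation_op_code(ArgOperation op)
    {
      switch (op)
      {
        case ArgOperation::MAX: return 1;
        case ArgOperation::MIN: return 2;
        default: throw std::runtime_error("Operation not supported, yet");
      }
    }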
+- */ +- +-#include "arm_compute/core/CL/kernels/CLCastKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {} +- +-void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); +- +- _input = input; +- _output = output; +- +- constexpr unsigned int num_elems_processed_per_iteration = 16; +- +- // Set kernel build options +- CLBuildOptions build_opts; +- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); +- build_opts.add_option("-DDATA_TYPE_OUT=" + +- get_cl_type_from_data_type(output->info()->data_type())); +- build_opts.add_option( +- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); +- +- // Create kernel +- if (is_data_type_quantized_asymmetric(input->info()->data_type())) +- { +- UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); +- const float scale_in = qinfo.scale; +- const int offset_in = qinfo.offset; +- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in)); +- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in)); +- +- _kernel = static_cast( +- CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options())); +- } +- else if (is_data_type_quantized_asymmetric(output->info()->data_type())) +- { +- UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform(); +- const float scale_in = qinfo.scale; +- const float offset_in = qinfo.offset; +- +- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in)); +- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in)); +- +- _kernel = static_cast( +- CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options())); +- } +- else +- { +- build_opts.add_option_if(input_subtype == SubDataType::BOOL, "-DBOOL_INPUT"); +- _kernel = static_cast( +- CLKernelLibraryEx::get().create_kernel("cast", build_opts.options())); +- } +- +- // Configure kernel window +- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); +- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); +- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); +- update_window_and_padding(win, input_access, output_access); +- output_access.set_valid_region(win, input->info()->valid_region()); +- +- ICLKernel::configure_internal(win); +-} +- +-void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); +- +- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); +- Window slice = collapsed.first_slice_window_3D(); +- +- do +- { +- unsigned int idx = 0; +- add_3D_tensor_argument(idx, _input, slice); +- 
add_3D_tensor_argument(idx, _output, slice); +- enqueue(queue, *this, slice, lws_hint()); +- } while (collapsed.slide_window_slice_3D(slice)); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp +deleted file mode 100644 +index 3891368..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp ++++ /dev/null +@@ -1,140 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
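The CLCastKernel removed above picks one of three OpenCL kernels depending on which side is quantized. A sketch of that selection; the booleans stand in for the is_data_type_quantized_asymmetric() checks on the input and output infos:

    #include <string>

    // Kernel variant chosen by the deleted CLCastKernel::configure():
    // quantized input is dequantized, quantized output is quantized,
    // everything else uses the plain cast kernel.
    std::string select_cast_kernel(bool input_is_qasymm, bool output_is_qasymm)
    {
      if (input_is_qasymm)
        return "cast_qasymm_in";
      if (output_is_qasymm)
        return "cast_qasymm_out";
      return "cast";
    }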
+- */ +- +-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-namespace +-{ +-// TODO Use this validation function +-#if 0 +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, +- const int32_t block_size) +-{ +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, +- "Block size should be greater than or equal to 1."); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size, +- "Output width should be equal to (Input width * block size)"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size, +- "Output height should be equal to (Input height * block size)"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0, +- "Input depth should be divisible by (block size * block size)"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- output->dimension(2) != input->dimension(2) / (block_size * block_size), +- "Output depth should be equal to (Input depth / (block size * block size))"); +- +- return Status{}; +-} +-#endif +-} // namespace +- +-CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr) +-{ +- // DO NOTHING +-} +- +-void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, +- const int32_t block_size) +-{ +- // TODO Add validation of data_layout +- _input = input; +- _output = output; +- +- // Set kernel build options +- auto layout_out = output->info()->data_layout(); +- std::set build_opts; +- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); +- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); +- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); +- auto depth = output->info()->dimension(index_depth); +- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth)); +- build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z())); +- +- // Create kernel +- _kernel = static_cast(CLKernelLibraryEx::get().create_kernel( +- "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts)); +- +- // Configure kernel window +- Window win = calculate_max_window(*output->info(), Steps()); +- +- Coordinates coord; +- coord.set_num_dimensions(output->info()->num_dimensions()); +- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); +- +- ICLKernel::configure_internal(win); +-} +- +-void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); +- +- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); +- +- // Setup input slice +- Window slice_in(slice_out); +- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); +- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); +- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); +- 
slice_in.set(3, Window::Dimension(0, 0, 0)); +- +- do +- { +- unsigned int idx = 0; +- add_4D_tensor_argument(idx, _input, slice_in); +- add_4D_tensor_argument(idx, _output, slice_out); +- enqueue(queue, *this, slice_out); +- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +index 79f5ce0..67aaf2d 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +@@ -43,6 +43,7 @@ + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibraryEx.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp +deleted file mode 100644 +index 235e897..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp ++++ /dev/null +@@ -1,372 +0,0 @@ +-/* +- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
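The (disabled) validation block in the CLDepthToSpaceKernel removed above spells out the expected shape relation: spatial dimensions grow by block_size and channels shrink by block_size squared. A sketch of that relation; Dims3 is an illustrative struct, not an arm_compute type:

    #include <stdexcept>

    struct Dims3 { int w, h, c; };

    // Output shape implied by the shape checks in the deleted kernel's
    // validate_arguments().
    Dims3 depth_to_space_output_shape(const Dims3 &in, int block_size)
    {
      if (block_size < 1)
        throw std::invalid_argument("Block size should be greater than or equal to 1.");
      if (in.c % (block_size * block_size) != 0)
        throw std::invalid_argument("Input depth should be divisible by (block size * block size)");
      return Dims3{in.w * block_size, in.h * block_size, in.c / (block_size * block_size)};
    }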
+- */ +- +-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h" +- +-#include "arm_compute/core/AccessWindowStatic.h" +-#include "arm_compute/core/AccessWindowTranspose.h" +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +-#include "arm_compute/core/CL/OpenCL.h" +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Utils.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/Window.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +-#include "support/ToolchainSupport.h" +- +-#include +-#include +-#include +- +-using namespace arm_compute; +-using namespace arm_compute::misc::shape_calculator; +- +-namespace arm_compute +-{ +-class Coordinates; +-} // namespace arm_compute +- +-namespace +-{ +-using ElementsProcessed = Steps; +- +-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, +- const ITensorInfo *output, const GEMMReshapeInfo &gemm_info) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, +- "The number of dimensions for the matrix A must be <= 4"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, +- "The number of dimensions for the matrix B must be <= 3"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && +- gemm_info.reinterpret_input_as_3d(), +- "The input1 tensor cannot have more than 2 dimensions if input0 " +- "has to be reinterpreted as 3D"); +- +- const int m = gemm_info.m(); +- const int n = gemm_info.n(); +- const int k = gemm_info.k(); +- +- ARM_COMPUTE_UNUSED(m); +- ARM_COMPUTE_UNUSED(n); +- ARM_COMPUTE_UNUSED(k); +- +- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast(k)); +- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast(n)); +- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast(k)); +- if (gemm_info.reinterpret_input_as_3d()) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != +- static_cast(m)); +- } +- else +- { +- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast(m)); +- } +- +- if (output->total_size() != 0) +- { +- const TensorInfo tensor_info_output = +- output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info)); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); +- } +- +- return Status{}; +-} +- +-std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, +- ITensorInfo *output, +- const GEMMReshapeInfo &gemm_info, +- ElementsProcessed &num_elements_processed) +-{ +- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; +- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; +- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); +- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); +- +- Window win{}; +- Window win_out{}; +- bool window_changed = false; +- +- // In case both input and output have to be reinterpreted as 3D tensors, +- // force 
reinterpret_input_as_3d and reinterpret_output_as_3d to be false. +- if (reinterpret_input_as_3d == reinterpret_output_as_3d) +- { +- reinterpret_input_as_3d = false; +- reinterpret_output_as_3d = false; +- } +- +- // Output tensor auto inizialitation if not yet initialized +- auto_init_if_empty(*output, +- input0->clone() +- ->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info)) +- .set_data_type(DataType::S32)); +- +- TensorInfo tmp_info(*output); +- +- if (reinterpret_output_as_3d) +- { +- // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D +- // GEMM, +- // the window needs to be constructed on the 2D collapsed version of the tensor +- TensorShape tmp_shape(output->tensor_shape()); +- tmp_shape.collapse(2U, 1U); +- tmp_info.set_tensor_shape(tmp_shape); +- } +- +- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x +- // Note: if the dot product instruction is available, the 8x2 tile has to be used +- num_elems_processed_per_iteration_x = 4; +- num_elems_processed_per_iteration_y = std::min(static_cast(output->dimension(1)), 4); +- +- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor +- // The only way to set properly the paddings, it is to set those explicitly through the +- // AccessWindowStatic +- const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] +- : input0->tensor_shape()[1]; +- const int bottom_pad = +- (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % +- num_elems_processed_per_iteration_y; +- +- // Configure window +- win = calculate_max_window( +- tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); +- win_out = calculate_max_window( +- *output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); +- +- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), +- input0->dimension(1) + bottom_pad); +- AccessWindowStatic input1_access( +- input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), +- input1->dimension(1)); +- AccessWindowStatic output_access( +- output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), +- output->dimension(1) + bottom_pad); +- +- window_changed = +- update_window_and_padding(win, input0_access, +- input1_access) || // window used by the execute_window_loop +- update_window_and_padding( +- win_out, +- output_access); // window used to update the padding requirements of output tensor +- +- Coordinates coord; +- coord.set_num_dimensions(output->num_dimensions()); +- output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape())); +- +- // Collapse along the Z direction +- // This collapse needs to be here in order to tune the Z dimension of LWS +- Window collapsed = win; +- const unsigned int dimension_to_collapse = +- std::min(static_cast(output->num_dimensions()), 2u); +- collapsed = win.collapse(win, dimension_to_collapse); +- +- Status err = (window_changed) +- ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") +- : Status{}; +- return std::make_pair(err, collapsed); +-} +-} // namespace +- +-CLGEMMLowpMatrixMultiplyKernelEx::CLGEMMLowpMatrixMultiplyKernelEx() +- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), +- _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false) +-{ +-} +- +-void CLGEMMLowpMatrixMultiplyKernelEx::configure(const ICLTensor *input0, const ICLTensor *input1, +- ICLTensor *output, +- const GEMMReshapeInfo &gemm_info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); +- +- ARM_COMPUTE_ERROR_THROW_ON( +- validate_arguments(input0->info(), input1->info(), output->info(), gemm_info)); +- +- _input0 = input0; +- _input1 = input1; +- _output = output; +- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); +- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); +- +- // In case both input and output have to be reinterpreted as 3D tensors, +- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. +- if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) +- { +- _reinterpret_input_as_3d = false; +- _reinterpret_output_as_3d = false; +- } +- +- // Check if we need to slide the matrix B +- const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d +- ? _input0->info()->num_dimensions() - 1 +- : _input0->info()->num_dimensions(); +- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); +- +- ElementsProcessed num_elements_processed{}; +- +- // Configure kernel window +- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), +- gemm_info, num_elements_processed); +- ARM_COMPUTE_ERROR_THROW_ON(win_config.first); +- ICLKernel::configure_internal(win_config.second); +- +- // Create build options +- std::string kernel_name(" "); +- CLBuildOptions build_opts; +- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); +- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); +- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, +- "-DHEIGHT_GEMM3D=" + +- support::cpp11::to_string(output->info()->dimension(1))); +- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, +- "-DDEPTH_GEMM3D=" + +- support::cpp11::to_string(output->info()->dimension(2))); +- build_opts.add_option_if(!_slide_matrix_b, +- "-DMATRIX_B_DEPTH=" + +- support::cpp11::to_string(input1->info()->dimension(2))); +- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); +- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + +- support::cpp11::to_string(num_elements_processed.x())); +- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + +- support::cpp11::to_string(num_elements_processed.y())); +- +- kernel_name = "gemmlowp_mm_midgard_ex"; +- +- // Create kernel +- _kernel = static_cast( +- CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); +- +- // Set config_id for enabling LWS tuning +- _config_id = kernel_name; +- _config_id += "_"; +- _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); +- _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); +- _config_id += lower_string(string_from_data_type(input0->info()->data_type())); +- _config_id += "_"; +- _config_id += support::cpp11::to_string(output->info()->dimension(1)); +- _config_id += "_"; +- _config_id += support::cpp11::to_string(output->info()->dimension(0)); +-} +- +-Status CLGEMMLowpMatrixMultiplyKernelEx::validate(const ITensorInfo *input0, +- const ITensorInfo *input1, +- const ITensorInfo *output, +- const GEMMReshapeInfo &gemm_info) +-{ +- ElementsProcessed num_elements_processed{}; +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info)); +- ARM_COMPUTE_RETURN_ON_ERROR( +- validate_and_configure_window(input0->clone().get(), input1->clone().get(), +- output->clone().get(), gemm_info, num_elements_processed) +- .first); +- +- return Status{}; +-} +- +-void CLGEMMLowpMatrixMultiplyKernelEx::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); +- +- if (_input1->info()->num_dimensions() < 3) +- { +- // The stride_z for matrix B must be zero if we do not slice +- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); +- } +- +- Window slice = window.first_slice_window_3D(); +- Window slice_matrix_b = slice; +- +- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); +- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); +- +- if (_reinterpret_input_as_3d) +- { +- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor +- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; +- const unsigned int total_cross_plane_pad = +- _input0->info()->padding().top + _input0->info()->padding().bottom; +- _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); +- } +- +- if (_reinterpret_output_as_3d) +- { +- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor +- const unsigned int idx0 = +- 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); +- const unsigned int total_cross_plane_pad = +- _output->info()->padding().top + _output->info()->padding().bottom; +- _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); +- } +- +- do +- { +- Window slice_b = slice; +- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A +- // more than 2 +- // This scenario can happen when the matrix multiplication is used to perform a convolution +- // operation +- if (!_slide_matrix_b) +- { +- slice_b = slice_matrix_b; +- } +- +- unsigned int idx = 0; +- add_2D_tensor_argument(idx, _input0, slice); +- add_2D_tensor_argument(idx, _input1, slice_b); +- add_2D_tensor_argument(idx, _output, slice); +- _kernel.setArg(idx++, +- static_cast(_input0->info()->strides_in_bytes()[2])); +- _kernel.setArg(idx++, +- static_cast(_input1->info()->strides_in_bytes()[2])); +- _kernel.setArg(idx++, +- static_cast(_output->info()->strides_in_bytes()[2])); +- enqueue(queue, *this, slice, lws_hint()); +- } while (window.slide_window_slice_3D(slice)); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +index 3a25987..3bfe3e4 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +@@ -45,6 +45,7 @@ + #include "arm_compute/core/CL/ICLTensor.h" + #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + #include "arm_compute/core/UtilsEx.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +index 7fbdcda..930e7c9 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +@@ -43,6 +43,7 @@ + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibraryEx.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +@@ -110,7 +111,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso + _hits = hits; + + // Make _lookup_indices tensor +- _lookup_indices = arm_compute::support::cpp14::make_unique(); ++ _lookup_indices = support::cpp14::make_unique(); + _lookup_indices->allocator()->init( + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + _lookup_indices->allocator()->allocate(); +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +index b45f6bb..61c14d2 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +@@ -48,7 +48,7 @@ + #include "arm_compute/core/TensorInfo.h" + #include "arm_compute/core/Utils.h" + #include "arm_compute/core/Window.h" +- ++#include "support/StringSupport.h" + #include "support/ToolchainSupport.h" + + namespace arm_compute +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +index d305896..6b27c99 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp ++++ 
b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +@@ -49,6 +49,7 @@ + #include "arm_compute/core/Utils.h" + #include "arm_compute/core/Validate.h" + #include "arm_compute/core/Window.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +index 74f7b41..643c8b1 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +@@ -43,6 +43,7 @@ + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibraryEx.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp +deleted file mode 100644 +index 8910a7b..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp ++++ /dev/null +@@ -1,210 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
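The CLGEMMLowpMatrixMultiplyKernelEx removed above validates that A, B and the output agree with the m/n/k values carried in GEMMReshapeInfo. A sketch of the 2D case of those checks; the plain size_t parameters stand in for ITensorInfo::dimension(0) and dimension(1):

    #include <cstddef>

    // A(m x k) * B(k x n): dimension(0) is the column count and dimension(1)
    // the row count, as in the deleted validate_arguments().
    bool gemmlowp_dims_consistent(std::size_t a_dim0, std::size_t a_dim1,
                                  std::size_t b_dim0, std::size_t b_dim1,
                                  int m, int n, int k)
    {
      return a_dim0 == static_cast<std::size_t>(k)   // A has k columns
          && a_dim1 == static_cast<std::size_t>(m)   // and m rows (2D case)
          && b_dim0 == static_cast<std::size_t>(n)   // B has n columns
          && b_dim1 == static_cast<std::size_t>(k);  // and k rows
      // When reinterpret_input_as_3d is set, dimension(1) * dimension(2) of A
      // must equal m instead, as in the original code.
    }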
+- */ +- +-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-namespace +-{ +-constexpr unsigned int num_elems_processed_per_iteration = 16; +- +-Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +-{ +- const TensorShape &out_shape = +- TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); +- +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, +- DataType::QASYMM8); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, +- DataType::QASYMM8); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, +- "Inputs are not broadcast compatible"); +- // Validate in case of configured output +- if (output->total_size() > 0) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, +- DataType::QASYMM8); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), +- "Wrong shape for output"); +- } +- return Status{}; +-} +-} // namespace +- +-CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} +- +-void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) +-{ +- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); +- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); +- ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info())); +- +- _input = input; +- _alpha = alpha; +- _output = output; +- +- // Create kernel +- std::string kernel_name = "prelu"; +- std::set build_opts; +- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); +- build_opts.emplace( +- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); +- +- if (is_data_type_quantized_asymmetric(input->info()->data_type())) +- { +- build_opts.emplace("-DOFF_IN=" + support::cpp11::to_string( +- input->info()->quantization_info().uniform().offset)); +- build_opts.emplace("-DOFF_ALPHA=" + support::cpp11::to_string( +- alpha->info()->quantization_info().uniform().offset)); +- build_opts.emplace("-DOFF_OUT=" + support::cpp11::to_string( +- output->info()->quantization_info().uniform().offset)); +- build_opts.emplace("-DSCALE_IN=" + support::cpp11::to_string( +- input->info()->quantization_info().uniform().scale)); +- build_opts.emplace("-DSCALE_ALPHA=" + support::cpp11::to_string( +- alpha->info()->quantization_info().uniform().scale)); +- build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string( +- output->info()->quantization_info().uniform().scale)); +- kernel_name += "_qasymm8"; +- } +- _kernel = +- static_cast(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); +- +- const std::pair broadcast_pair = +- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); +- +- const TensorShape &out_shape = broadcast_pair.first; +- const ValidRegion &valid_region = broadcast_pair.second; +- +- // Auto initialize output if not initialized +- { +- set_shape_if_empty(*output->info(), out_shape); +- +- if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) +- { +- set_format_if_unknown(*output->info(), Format::F16); +- } +- else if 
(input->info()->data_type() == DataType::F32 || +- alpha->info()->data_type() == DataType::F32) +- { +- set_format_if_unknown(*output->info(), Format::F32); +- } +- } +- +- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); +- Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); +- Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); +- +- AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); +- AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); +- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); +- +- update_window_and_padding(win_input1, input1_access) || +- update_window_and_padding(win_input2, input2_access) || +- update_window_and_padding(win, output_access); +- +- output_access.set_valid_region(win, valid_region); +- +- ICLKernel::configure_internal(win); +-} +- +-void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); +- +- const TensorShape &in_shape1 = _input->info()->tensor_shape(); +- const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); +- const TensorShape &out_shape = _output->info()->tensor_shape(); +- +- bool can_collapse = true; +- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) +- { +- can_collapse = +- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); +- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) +- { +- can_collapse = (in_shape1[d] == in_shape2[d]); +- } +- } +- +- bool has_collapsed = false; +- Window collapsed = +- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) +- : window; +- +- const TensorShape &in_shape1_collapsed = +- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; +- const TensorShape &in_shape2_collapsed = +- has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; +- +- Window slice = collapsed.first_slice_window_3D(); +- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); +- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); +- +- do +- { +- unsigned int idx = 0; +- add_3D_tensor_argument(idx, _input, slice_input1); +- add_3D_tensor_argument(idx, _alpha, slice_input2); +- add_3D_tensor_argument(idx, _output, slice); +- +- enqueue(queue, *this, slice); +- +- collapsed.slide_window_slice_3D(slice_input1); +- collapsed.slide_window_slice_3D(slice_input2); +- } while (collapsed.slide_window_slice_3D(slice)); +-} +- +-BorderSize CLPReLUKernel::border_size() const +-{ +- const unsigned int replicateSize = +- _output->info()->dimension(0) - +- std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); +- const unsigned int border = +- std::min(num_elems_processed_per_iteration - 1U, replicateSize); +- return BorderSize(0, border, 0, 0); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +index 2d551f6..1a7a18c 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +@@ -49,6 +49,7 @@ + #include "arm_compute/core/Utils.h" + #include "arm_compute/core/Validate.h" + #include "arm_compute/core/Window.h" ++#include "support/StringSupport.h" + + namespace arm_compute + { +@@ -69,7 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_fac + + // Output must always be initialized + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + + return Status{}; +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +index a983183..06c2579 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +@@ -43,6 +43,7 @@ + #include "arm_compute/core/CL/CLHelpers.h" + #include "arm_compute/core/CL/CLKernelLibraryEx.h" + #include "arm_compute/core/CL/ICLTensor.h" ++#include "support/StringSupport.h" + + using namespace arm_compute; + namespace +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +index ff1904a..8d8853c 100644 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp ++++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +@@ -48,6 +48,7 @@ + #include "arm_compute/core/Validate.h" + #include "arm_compute/core/Window.h" + #include "arm_compute/core/utils/misc/ShapeCalculator.h" ++#include "support/StringSupport.h" + + #include + +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp +deleted file mode 100644 +index 64fc038..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp ++++ /dev/null +@@ -1,148 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibraryEx.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-namespace +-{ +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, +- const int32_t block_size) +-{ +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, +- DataType::S16, DataType::S32, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, +- "Block size should be greater than or equal to 1."); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3), +- "Input batch should be equal to Output batch"); +- +- auto layout_out = input->data_layout(); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); +- +- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); +- auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT); +- auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth), +- "Output depth should be equal to (input depth * block size *block size)"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) || +- (input->dimension(index_height) % block_size), +- "Input height and width should be divisible by block size"); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- (output->dimension(index_width) != (input->dimension(index_width) / block_size)) || +- (output->dimension(index_height) != (input->dimension(index_height) / block_size)), +- "Output height and width should be equal to " +- "input_height/blocksize and input_width/blocksize respectively"); +- +- return Status{}; +-} +- +-} // namespace +- +-CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {} +- +-void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, +- const int32_t block_size) +-{ +- +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size)); +- +- _input = input; +- _output = output; +- +- // Set kernel build options +- auto layout_out = input->info()->data_layout(); +- std::set build_opts; +- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); +- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); +- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); +- auto depth = input->info()->dimension(index_depth); +- build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth)); +- build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z())); +- +- // Create kernel +- _kernel = static_cast(CLKernelLibraryEx::get().create_kernel( +- "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts)); +- +- // Configure kernel window +- Window win = calculate_max_window(*input->info(), Steps()); +- +- Coordinates coord; +- coord.set_num_dimensions(output->info()->num_dimensions()); +- 
output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); +- +- ICLKernel::configure_internal(win); +-} +- +-void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); +- +- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); +- +- // Setup output slice +- Window slice_out(slice_in); +- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); +- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); +- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); +- slice_out.set(3, Window::Dimension(0, 0, 0)); +- +- do +- { +- unsigned int idx = 0; +- add_4D_tensor_argument(idx, _input, slice_in); +- add_4D_tensor_argument(idx, _output, slice_out); +- enqueue(queue, *this, slice_in); +- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +-} +diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp +deleted file mode 100644 +index 61999cb..0000000 +--- a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp ++++ /dev/null +@@ -1,188 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" +- +-#include "arm_compute/core/CL/CLHelpers.h" +-#include "arm_compute/core/CL/CLKernelLibrary.h" +-#include "arm_compute/core/CL/CLValidate.h" +-#include "arm_compute/core/CL/ICLTensor.h" +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/Window.h" +- +-using namespace arm_compute; +- +-CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel() +- : _input(nullptr), _output(nullptr), _inner_border(), _info() +-{ +-} +- +-Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input, +- const ITensorInfo *output, +- const BorderSize &inner_border, +- const PadStrideInfo &info) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); +- +- const DataLayout data_layout = input->data_layout(); +- +- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); +- +- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0); +- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); +- +- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); +- for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); +- } +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, +- "inner_border_right must be smaller that stride_x"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, +- "inner_border_top must be smaller that stride_y"); +- +- return Status{}; +-} +- +-void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, +- const BorderSize &inner_border, +- const PadStrideInfo &info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- _input = input; +- _output = output; +- _inner_border = inner_border; +- _info = info; +- +- // Perform validation step +- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate( +- input->info(), output->info(), inner_border, info)); +- +- // Create kernel +- CLBuildOptions build_opts; +- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); +- _kernel = static_cast( +- CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options())); +- +- constexpr unsigned int num_elems_processed_per_iteration = 1; +- +- // Configure kernel window +- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); +- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); +- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); +- +- ICLKernel::configure_internal(win); +-} +- +-void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue) +-{ +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- 
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); +- +- const DataLayout data_layout = _input->info()->data_layout(); +- +- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +- +- const int out_start_x = _info.pad_left(); +- const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right - +- _info.pad_right() + _info.stride().first - 1; +- const int out_step_x = _info.stride().first; +- +- const int out_start_y = _inner_border.top + _info.pad_top(); +- const int out_end_y = +- _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; +- const int out_step_y = _info.stride().second; +- +- switch (data_layout) +- { +- case DataLayout::NCHW: +- { +- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); +- +- Window slice_out = collapsed.first_slice_window_3D(); +- slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x)); +- slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y)); +- +- Window slice_in = collapsed.first_slice_window_3D(); +- +- do +- { +- unsigned int idx = 0; +- add_3D_tensor_argument(idx, _input, slice_in); +- add_3D_tensor_argument(idx, _output, slice_out); +- enqueue(queue, *this, slice_out); +- } while (collapsed.slide_window_slice_3D(slice_in) && +- collapsed.slide_window_slice_3D(slice_out)); +- break; +- } +- case DataLayout::NHWC: +- { +- // NOTE: not collapsing in NHWC +- Window slice_out = window.first_slice_window_3D(); +- slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x)); +- slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y)); +- +- Window slice_in = window.first_slice_window_3D(); +- +- do +- { +- unsigned int idx = 0; +- add_3D_tensor_argument(idx, _input, slice_in); +- add_3D_tensor_argument(idx, _output, slice_out); +- enqueue(queue, *this, slice_out); +- } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); +- break; +- } +- default: +- ARM_COMPUTE_ERROR("Unsupported data layout"); +- } +-} +diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp +deleted file mode 100644 +index 648afb3..0000000 +--- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp ++++ /dev/null +@@ -1,118 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +- +-#include +-#include +- +-namespace arm_compute +-{ +-CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {} +- +-bool CPPUpsampleKernelEx::is_parallelisable() const { return false; } +- +-void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output, +- const PadStrideInfo &info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- _input = input; +- _output = output; +- _info = info; +- +- // Configure kernel window +- Window win = calculate_max_window(*input->info(), Steps()); +- +- // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped +- Coordinates coord; +- coord.set_num_dimensions(output->info()->num_dimensions()); +- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); +- +- ICPPKernel::configure(win); +-} +- +-void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); +- +- // Initialize _scaled_output buffer +- const int width_scaled = _output->info()->dimension(0); +- const int height_scaled = _output->info()->dimension(1); +- const int stride_x = _info.stride().first; +- const int stride_y = _info.stride().second; +- const int start_x = _info.pad_left(); +- const int start_y = _info.pad_top(); +- const int end_y = height_scaled - _info.pad_bottom(); +- const int end_x = width_scaled - _info.pad_top(); +- const size_t element_size = _input->info()->element_size(); +- +- // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset +- const uint8_t fill_value = +- _output->info()->data_type() == DataType::QASYMM8 +- ? 
utility::clamp(_output->info()->quantization_info().uniform().offset) +- : 0; +- // Filling a value different than 0 works only for QASYMM8 datatype since we are filling 1byte +- // values in a buffer of uint8_ts +- std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value); +- +- // Create window +- Window window_out(window); +- window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x)); +- window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y)); +- +- // Create iterators +- Iterator in(_input, window); +- Iterator out(_output, window_out); +- +- execute_window_loop( +- window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out); +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp +deleted file mode 100644 +index fbb9dbc..0000000 +--- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp ++++ /dev/null +@@ -1,671 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/core/NEON/kernels/NECastKernel.h" +- +-#include "arm_compute/core/AccessWindowStatic.h" +-#include "arm_compute/core/CPP/Validate.h" +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/NEON/NEAsymm.h" +-#include "arm_compute/core/NEON/wrapper/wrapper.h" +-#include "arm_compute/core/Utils.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/Window.h" +- +-#include +- +-namespace arm_compute +-{ +-namespace +-{ +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, +- SubDataType input_subtype) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, +- DataType::QASYMM8, DataType::U32, +- DataType::S32, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL && +- input->data_type() != DataType::U8); +- +- if (output->tensor_shape().total_size() > 0) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, +- DataType::QASYMM8, DataType::U32, +- DataType::S32, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); +- } +- +- return Status{}; +-} +- +-std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +-{ +- // Configure kernel window +- Window win = calculate_max_window(*input, Steps()); +- +- // Output tensor auto initialization if not yet initialized +- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); +- +- // NECastKernel doesn't need padding so update_window_and_padding() can be skipped +- Coordinates coord; +- coord.set_num_dimensions(output->num_dimensions()); +- output->set_valid_region(ValidRegion(coord, output->tensor_shape())); +- +- return std::make_tuple(Status{}, win); +-} +- +-typedef struct bool8x16 +-{ +- uint8x16_t val; +-} bool8x16_t; +- +-static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; } +- +-template inline ToV vcast(const FromV &v) { return v; } +-template <> inline uint8x16_t vcast(const bool8x16_t &v) +-{ +- const uint8x16_t vu8 = vreinterpretq_u8_b8(v); +- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); +- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); +- return vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +-} +- +-template <> inline uint32x4x4_t vcast(const bool8x16_t &v) +-{ +- const uint8x16_t vu8 = vreinterpretq_u8_b8(v); +- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); +- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); +- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +- +- const uint32x4x4_t ret = {{ +- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))), +- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))), +- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))), +- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))), +- }}; +- +- return ret; +-} +- +-template <> inline int32x4x4_t vcast(const bool8x16_t &v) +-{ +- const uint8x16_t vu8 = vreinterpretq_u8_b8(v); +- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); +- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); +- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +- +- const int32x4x4_t ret = {{ +- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), +- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), +- 
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), +- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), +- }}; +- +- return ret; +-} +- +-template <> inline float32x4x4_t vcast(const bool8x16_t &v) +-{ +- const uint8x16_t vu8 = vreinterpretq_u8_b8(v); +- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); +- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); +- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +- +- const float32x4x4_t ret = {{ +- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), +- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), +- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), +- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), +- }}; +- +- return ret; +-} +- +-template <> inline uint32x4x4_t vcast(const uint8x16_t &v) +-{ +- const uint32x4x4_t ret = {{ +- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))), +- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))), +- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))), +- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))), +- }}; +- +- return ret; +-} +- +-template <> inline int32x4x4_t vcast(const uint8x16_t &v) +-{ +- const int32x4x4_t ret = {{ +- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), +- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), +- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), +- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), +- }}; +- +- return ret; +-} +- +-template <> inline float32x4x4_t vcast(const uint8x16_t &v) +-{ +- const float32x4x4_t ret = {{ +- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), +- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), +- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), +- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), +- }}; +- +- return ret; +-} +- +-template <> inline uint8x16_t vcast(const int32x4x4_t &v) +-{ +- // Saturate cast +- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))), +- vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3])))); +-} +- +-template <> inline uint32x4x4_t vcast(const int32x4x4_t &v) +-{ +- // Saturate cast +- const uint32x4x4_t ret = {{ +- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))), +- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))), +- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))), +- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))), +- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))), +- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))), +- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))), +- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))), +- }}; +- +- return ret; +-} +- +-template <> inline float32x4x4_t vcast(const int32x4x4_t &v) +-{ +- const float32x4x4_t ret = {{ +- vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]), +- vcvtq_f32_s32(v.val[3]), +- }}; +- +- return ret; +-} +- +-template <> inline uint8x16_t vcast(const uint32x4x4_t &v) +-{ +- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))), +- vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3])))); +-} +- +-template <> inline int32x4x4_t vcast(const uint32x4x4_t &v) +-{ +- const int32x4x4_t ret = {{ +- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))), +- 
vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))), +- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))), +- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))), +- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))), +- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))), +- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))), +- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))), +- }}; +- +- return ret; +-} +- +-template <> inline float32x4x4_t vcast(const uint32x4x4_t &v) +-{ +- const float32x4x4_t ret = {{ +- vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]), +- vcvtq_f32_u32(v.val[3]), +- }}; +- +- return ret; +-} +- +-template <> inline uint8x16_t vcast(const float32x4x4_t &v) +-{ +- // Saturate cast +- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])), +- vqmovun_s32(vcvtq_s32_f32(v.val[1])))), +- vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])), +- vqmovun_s32(vcvtq_s32_f32(v.val[3]))))); +-} +- +-template <> inline uint32x4x4_t vcast(const float32x4x4_t &v) +-{ +- const uint32x4x4_t ret = {{ +- vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]), +- vcvtq_u32_f32(v.val[3]), +- }}; +- +- return ret; +-} +- +-template <> inline int32x4x4_t vcast(const float32x4x4_t &v) +-{ +- const int32x4x4_t ret = {{ +- vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]), +- vcvtq_s32_f32(v.val[3]), +- }}; +- +- return ret; +-} +- +-template struct cast_vector; +-template <> struct cast_vector +-{ +- using type = bool8x16_t; +-}; +-template <> struct cast_vector +-{ +- using type = uint8x16_t; +-}; +-template <> struct cast_vector +-{ +- using type = uint32x4x4_t; +-}; +-template <> struct cast_vector +-{ +- using type = int32x4x4_t; +-}; +-template <> struct cast_vector +-{ +- using type = float32x4x4_t; +-}; +- +-template inline void store_result(T *ptr, const typename cast_vector::type &v) +-{ +- wrapper::vstore(ptr, v.val[0]); +- wrapper::vstore(ptr + 4, v.val[1]); +- wrapper::vstore(ptr + 8, v.val[2]); +- wrapper::vstore(ptr + 12, v.val[3]); +-} +- +-template <> inline void store_result(uint8_t *ptr, const uint8x16_t &v) +-{ +- wrapper::vstore(ptr, v); +-} +- +-inline bool8x16_t vloadq(const bool *ptr) +-{ +- bool8x16_t ret; +- ret.val = wrapper::vloadq(reinterpret_cast(ptr)); +- return ret; +-} +- +-template inline typename cast_vector::type load_input(const T *ptr) +-{ +- return wrapper::vloadq(ptr); +-} +- +-template <> inline typename cast_vector::type load_input(const bool *ptr) +-{ +- return vloadq(ptr); +-} +- +-template <> inline typename cast_vector::type load_input(const uint32_t *ptr) +-{ +- return vld4q_u32(ptr); +-} +- +-template <> inline typename cast_vector::type load_input(const int32_t *ptr) +-{ +- return vld4q_s32(ptr); +-} +- +-template <> inline typename cast_vector::type load_input(const float *ptr) +-{ +- return vld4q_f32(ptr); +-} +- +-template inline T get_value(const T *ptr) { return *ptr; } +- +-template <> inline bool get_value(const bool *ptr) +-{ +- bool ret = (*ptr != 0); +- return ret; +-} +- +-template void run_cast(const ITensor *input, ITensor *output, const Window &window) +-{ +- const int window_step_x = 16; +- const auto window_start_x = static_cast(window.x().start()); +- const auto window_end_x = static_cast(window.x().end()); +- +- // Collapse window and reset first dimension to 
handle tail calculations manually +- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); +- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); +- +- // Create iterators +- Iterator in(input, win_collapsed); +- Iterator out(output, win_collapsed); +- +-#ifdef __aarch64__ +- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +-#else //__aarch64__ +- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +-#endif //__aarch64__ +- +- execute_window_loop( +- win_collapsed, +- [&](const Coordinates &) { +- const auto in_ptr = reinterpret_cast(in.ptr()); +- +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- using from_vector = typename cast_vector::type; +- const from_vector vin = load_input(in_ptr + x); +- +- switch (output->info()->data_type()) +- { +- case DataType::U8: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::QASYMM8: +- { +- using to_vector = typename cast_vector::type; +- const UniformQuantizationInfo &qinfo_out = +- output->info()->quantization_info().uniform(); +- const auto vf = vcast(vin); +- const auto vout = vquantize(vf, qinfo_out); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::U32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::S32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::F32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- default: +- ARM_COMPUTE_ERROR("Unsupported data type."); +- } +- } +- +- // Compute left-over elements +- for (; x < window_end_x; ++x) +- { +- FromT val = get_value(in_ptr + x); +- switch (output->info()->data_type()) +- { +- case DataType::U8: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::QASYMM8: +- { +- const QuantizationInfo &qinfo_out = output->info()->quantization_info(); +- const auto qval = +- quantize_qasymm8(static_cast(val), qinfo_out, rounding_policy); +- *(reinterpret_cast(out.ptr()) + x) = qval; +- break; +- } +- case DataType::U32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::S32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::F32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- default: +- ARM_COMPUTE_ERROR("Unsupported data type."); +- } +- } +- }, +- in, out); +-} +- +-void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window) +-{ +- const int window_step_x = 16; +- const auto window_start_x = static_cast(window.x().start()); +- const auto window_end_x = static_cast(window.x().end()); +- +- // Collapse window and reset first dimension to handle tail calculations manually +- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); +- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); +- +- // Create iterators +- Iterator in(input, win_collapsed); +- Iterator out(output, win_collapsed); +- +-#ifdef __aarch64__ +- constexpr 
RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +-#else //__aarch64__ +- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +-#endif //__aarch64__ +- const auto &qinfo_in = input->info()->quantization_info().uniform(); +- const auto &qinfo_out = output->info()->quantization_info().uniform(); +- +- execute_window_loop( +- win_collapsed, +- [&](const Coordinates &) { +- const auto in_ptr = reinterpret_cast(in.ptr()); +- +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- using from_vector = typename cast_vector::type; +- const auto vf = wrapper::vloadq(in_ptr + x); +- const auto vin = vdequantize(vf, qinfo_in); +- switch (output->info()->data_type()) +- { +- case DataType::U8: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::QASYMM8: +- { +- using to_vector = typename cast_vector::type; +- const auto vf = vcast(vin); +- const auto vout = vquantize(vf, qinfo_out); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::U32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::S32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- case DataType::F32: +- { +- using to_vector = typename cast_vector::type; +- const to_vector vout = vcast(vin); +- store_result(reinterpret_cast(out.ptr()) + x, vout); +- break; +- } +- default: +- ARM_COMPUTE_ERROR("Unsupported data type."); +- } +- } +- +- // Compute left-over elements +- for (; x < window_end_x; ++x) +- { +- qasymm8_t qval_in = *(in_ptr + x); +- const auto val = dequantize_qasymm8(qval_in, qinfo_in); +- +- switch (output->info()->data_type()) +- { +- case DataType::U8: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::QASYMM8: +- { +- const auto qval_out = quantize_qasymm8(val, qinfo_out, rounding_policy); +- *(reinterpret_cast(out.ptr()) + x) = qval_out; +- break; +- } +- case DataType::U32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::S32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- case DataType::F32: +- { +- *(reinterpret_cast(out.ptr()) + x) = static_cast(val); +- break; +- } +- default: +- ARM_COMPUTE_ERROR("Unsupported data type."); +- } +- } +- }, +- in, out); +-} +-} // namespace +- +-NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE) +-{ +-} +- +-void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype)); +- +- _input = input; +- _output = output; +- _input_subtype = input_subtype; +- +- // Configure kernel window +- auto win_config = validate_and_configure_window(input->info(), output->info()); +- +- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); +- +- INEKernel::configure(std::get<1>(win_config)); +-} +- +-Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output, +- SubDataType input_subtype) +-{ +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, 
input_subtype)); +- ARM_COMPUTE_RETURN_ON_ERROR( +- std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); +- return Status{}; +-} +- +-void NECastKernel::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); +- +- switch (_input->info()->data_type()) +- { +- case DataType::U8: +- if (_input_subtype == SubDataType::BOOL) +- { +- run_cast(_input, _output, window); +- } +- else +- { +- run_cast(_input, _output, window); +- } +- break; +- case DataType::QASYMM8: +- run_cast_qasymm8(_input, _output, window); +- break; +- case DataType::U32: +- run_cast(_input, _output, window); +- break; +- case DataType::S32: +- run_cast(_input, _output, window); +- break; +- case DataType::F32: +- run_cast(_input, _output, window); +- break; +- default: +- ARM_COMPUTE_ERROR("Unsupported data type."); +- } +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp +deleted file mode 100644 +index 95e269d..0000000 +--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp ++++ /dev/null +@@ -1,181 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" +- +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/NEON/wrapper/wrapper.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +-#include +-#include +- +-using namespace arm_compute::misc::shape_calculator; +- +-namespace arm_compute +-{ +-namespace +-{ +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); +- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2); +- +- const DataLayout data_layout = input->data_layout(); +- const int idx_channel = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); +- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != +- 0); +- // Validate output if initialized +- if (output->total_size() != 0) +- { +- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +- const int idx_height = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != +- (block_shape * input->tensor_shape()[idx_width])); +- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != +- (block_shape * input->tensor_shape()[idx_height])); +- ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); +- } +- +- return Status{}; +-} +-} // namespace +- +-NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx() +- : _input(nullptr), _output(nullptr), _block_shape() +-{ +-} +- +-void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output, +- int32_t block_shape) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape); +- // Output auto inizialitation if not yet initialized +- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); +- +- // Perform validation step +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); +- +- _input = input; +- _output = output; +- _block_shape = block_shape; +- +- // Configure kernel window +- Window win = calculate_max_window(*input->info(), Steps()); +- ICPPKernel::configure(win); +-} +- +-Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, +- int32_t block_shape) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); +- return Status{}; +-} +- +-void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); +- +- const int idx_channel = +- get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL); +- const int depth_size = _input->info()->dimension(idx_channel); +- const int r = (depth_size / (_block_shape * _block_shape)); +- const int element_size = _input->info()->element_size(); +- +- Window slice_out = window.first_slice_window_3D(); +- +- // The slice_out slice does not move +- 
slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); +- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); +- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); +- +- // Main loop for NCHW and NHWC +- if (_input->info()->data_layout() == DataLayout::NCHW) +- { +- Window slice_in = window.first_slice_window_2D(); +- do +- { +- Iterator in(_input, slice_in); +- execute_window_loop(slice_in, +- [&](const Coordinates &id) { +- const int x = id.x(); +- const int y = id.y(); +- +- const int z = id.z() % r; +- const int out_x = x * _block_shape + (id.z() / r) % _block_shape; +- const int out_y = y * _block_shape + (id.z() / r) / _block_shape; +- Coordinates output_coords{out_x, out_y, z, id[3]}; +- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); +- }, +- in); +- } while (window.slide_window_slice_2D(slice_in)); +- } +- else +- { +- Window slice_in = window.first_slice_window_3D(); +- do +- { +- Iterator in(_input, slice_in); +- execute_window_loop(slice_in, +- [&](const Coordinates &id) { +- const int x = id.y(); +- const int y = id.z(); +- +- const int z = id.x() % r; +- const int out_x = x * _block_shape + (id.x() / r) % _block_shape; +- const int out_y = y * _block_shape + (id.x() / r) / _block_shape; +- Coordinates output_coords{z, out_x, out_y, id[3]}; +- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); +- }, +- in); +- } while (window.slide_window_slice_3D(slice_in)); +- } +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp +deleted file mode 100644 +index 200fc4f..0000000 +--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp ++++ /dev/null +@@ -1,221 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" +- +-#include "arm_compute/core/CPP/Validate.h" +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/IAccessWindow.h" +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/NEON/NEAsymm.h" +-#include "arm_compute/core/NEON/NEFixedPoint.h" +-#include "arm_compute/core/NEON/wrapper/wrapper.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Validate.h" +- +-#include +-#include +-#include +-#include +-#include +- +-namespace arm_compute +-{ +-class Coordinates; +- +-namespace +-{ +-template +-inline ScalarType elementwise_op_scalar(const ScalarType &a) +-{ +- switch (op) +- { +- case ElementWiseUnaryEx::NEG: +- return -a; +- default: +- ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); +- } +-} +- +-template +-inline VectorType elementwise_op(const VectorType &a) +-{ +- switch (op) +- { +- case ElementWiseUnaryEx::NEG: +- return wrapper::vneg(a); +- default: +- ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); +- } +-} +- +-template +-void elementwise_op(const ITensor *in, ITensor *out, const Window &window) +-{ +- const int window_step_x = 16 / sizeof(ScalarType); +- const auto window_start_x = static_cast(window.x().start()); +- const auto window_end_x = static_cast(window.x().end()); +- +- Window win = window; +- win.set(Window::DimX, Window::Dimension(0, 1, 1)); +- +- Iterator input(in, win); +- Iterator output(out, win); +- +- execute_window_loop(win, +- [&](const Coordinates &) { +- auto output_ptr = reinterpret_cast(output.ptr()); +- const auto input_ptr = reinterpret_cast(input.ptr()); +- +- int x = window_start_x; +- for (; x <= window_end_x - window_step_x; x += window_step_x) +- { +- wrapper::vstore(output_ptr + x, +- elementwise_op(wrapper::vloadq(input_ptr + x))); +- } +- for (; x < window_end_x; ++x) +- { +- *(output_ptr + x) = elementwise_op_scalar(*(input_ptr + x)); +- } +- }, +- input, output); +-} +- +-template +-std::function +-configure_func(const ITensor *input, ITensor *output) +-{ +- std::string function_to_call("op_"); +- function_to_call += string_from_data_type(input->info()->data_type()) + "_"; +- function_to_call += string_from_data_type(output->info()->data_type()); +- +- static std::map +- map_function = { +- {"op_F32_F32", &elementwise_op}, {"op_S32_S32", &elementwise_op}, +- }; +-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +- map_function["op_F16_F16"] = &elementwise_op; +-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +- +- auto it = map_function.find(function_to_call); +- +- if (it != map_function.end()) +- { +- auto func = it->second; +- return [func](const ITensor *input, ITensor *output, const Window &window) { +- func(input, output, window); +- }; +- } +- return nullptr; +-} +-} // namespace +- +-NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx() +- : _function(nullptr), _input(nullptr), _output(nullptr) +-{ +-} +- +-void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input, +- ITensor *output) +-{ +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info())); +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- // Configure kernel window +- const std::pair broadcast_pair = +- 
ITensorInfo::broadcast_shape_and_valid_region(*input->info()); +- const TensorShape &out_shape = broadcast_pair.first; +- const ValidRegion &valid_region = broadcast_pair.second; +- +- // Auto initialize output if not initialized +- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); +- +- Window win = calculate_max_window(valid_region); +- +- _input = input; +- _output = output; +- +- INEKernel::configure(win); +- +- switch (op) +- { +- case ElementWiseUnaryEx::NEG: +- _function = configure_func(input, output); +- break; +- default: +- ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); +- } +-} +- +-Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input, +- const ITensorInfo &output) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32, +- DataType::S32); +- +- // Validate in case of configured output +- if (output.total_size() > 0) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); +- } +- +- return Status{}; +-} +- +-Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input, +- const ITensorInfo *output) +-{ +- ARM_COMPUTE_UNUSED(op); +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output)); +- return Status{}; +-} +- +-void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); +- ARM_COMPUTE_ERROR_ON(_function == nullptr); +- _function(_input, _output, window); +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp +deleted file mode 100644 +index 641641b..0000000 +--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp ++++ /dev/null +@@ -1,291 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. 
+- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" +- +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/NEON/NEAsymm.h" +-#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" +-#include "arm_compute/core/NEON/wrapper/wrapper.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Window.h" +- +-#include +- +-using namespace arm_compute; +-namespace +-{ +- +-/** Conditional element-wise operations */ +-enum class ConditionalOperation +-{ +- PRELU, /**< (x * y) for x < 0, x for x >= 0 */ +-}; +- +-template +-inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b) +-{ +- auto res = ScalarType(0); +- +- switch (op) +- { +- case ConditionalOperation::PRELU: +- res = a < 0 ? a * b : a; +- break; +- default: +- ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); +- } +- return res; +-} +- +-template +-inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b, +- QuantizationInfo qinfo) +-{ +- return quantize_qasymm8(elementwise_conditional_op_scalar(a, b), qinfo, +- RoundingPolicy::TO_NEAREST_UP); +-} +- +-template +-inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b) +-{ +- VectorType res = {0, 0, 0, 0}; +- VectorType const_0 = {0, 0, 0, 0}; +- +- switch (op) +- { +- case ConditionalOperation::PRELU: +- res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b)); +- ; +- break; +- default: +- ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); +- } +- return res; +-} +- +-template +-inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b) +-{ +- float32x4x4_t out = {{ +- elementwise_conditional_op(a.val[0], b.val[0]), +- elementwise_conditional_op(a.val[1], b.val[1]), +- elementwise_conditional_op(a.val[2], b.val[2]), +- elementwise_conditional_op(a.val[3], b.val[3]), +- }}; +- return out; +-} +- +-template +-inline VectorType elementwise_conditional_op_broadcast(const VectorType &a, +- const ScalarType &broadcast_value, +- const bool reorder) +-{ +- VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); +- return elementwise_conditional_op(reorder ? broadcast_vector : a, +- reorder ? 
a : broadcast_vector); +-} +- +-template +-inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x, +- const ScalarType *input1_ptr, +- const ScalarType *input2_ptr, ScalarType *output_ptr) +-{ +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- const auto a = wrapper::vloadq(input1_ptr + x); +- const auto b = wrapper::vloadq(input2_ptr + x); +- wrapper::vstore(output_ptr + x, elementwise_conditional_op(a, b)); +- } +- return x; +-} +- +-template +-inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x, +- int window_step_x, const uint8_t *input1_ptr, +- const uint8_t *input2_ptr, uint8_t *output_ptr, +- int32x4_t voffset1, int32x4_t voffset2, +- float32x4_t vscale1, float32x4_t vscale2, +- float32x4_t voffseto, float32x4_t invvscaleo) +-{ +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- // Get inputs and compute output +- const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); +- const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); +- const float32x4x4_t rf = elementwise_conditional_op(af, bf); +- store_quantized(output_ptr + x, rf, voffseto, invvscaleo); +- } +- return x; +-} +- +-template +-inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x, +- int window_step_x, +- const ScalarType *non_broadcast_input_ptr, +- const ScalarType &broadcast_value, +- ScalarType *output_ptr, const bool reorder) +-{ +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); +- wrapper::vstore(output_ptr + x, +- elementwise_conditional_op_broadcast(a, broadcast_value, reorder)); +- } +- return x; +-} +- +-template +-inline int elementwise_conditional_op_quantized_broadcast_loop( +- int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, +- float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, +- float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +-{ +- int x = window_start_x; +- for (; x <= (window_end_x - window_step_x); x += window_step_x) +- { +- const float32x4x4_t af = +- load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); +- const float32x4x4_t rf = elementwise_conditional_op(reorder ? broadcast_vector : af, +- reorder ? 
af : broadcast_vector); +- store_quantized(output_ptr + x, rf, voffseto, invvscaleo); +- } +- return x; +-} +- +-template +-void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out, +- const Window &window) +-{ +- elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar, +- &elementwise_conditional_op_broadcast_loop, +- &elementwise_conditional_op_loop); +-} +- +-template +-void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, +- const Window &window) +-{ +- elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar, +- &elementwise_conditional_op_quantized_broadcast_loop, +- &elementwise_conditional_op_quantized_loop); +-} +-} // namespace +- +-NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} +- +-void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output); +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info())); +- +- // Configure kernel window +- const std::pair broadcast_pair = +- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); +- const TensorShape &out_shape = broadcast_pair.first; +- const ValidRegion &valid_region = broadcast_pair.second; +- +- // Auto initialize output if not initialized +- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); +- +- Window win = calculate_max_window(valid_region); +- +- _input = input; +- _alpha = alpha; +- _output = output; +- INEKernel::configure(win); +-} +- +-void NEPReLUKernel::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); +- +- if (_input->info()->data_type() == DataType::F32) +- { +- elementwise_conditional_op(_input, _alpha, +- _output, window); +- } +- else if (_input->info()->data_type() == DataType::QASYMM8) +- { +- elementwise_conditional_op_quantized(_input, _alpha, _output, +- window); +- } +- else +- { +- ARM_COMPUTE_ERROR("Wrong Type"); +- } +-} +- +-Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, +- const ITensorInfo &output) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output); +- +- const TensorShape out_shape = +- TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape()); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, +- "Inputs are not broadcast compatible"); +- +- // Checks performed when output is configured +- if (output.total_size() > 0) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), +- "Wrong shape for output"); +- } +- +- return Status{}; +-} +- +-Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha, +- const ITensorInfo *output) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output); +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output)); +- +- return Status{}; +-} +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +index 6ba0f1f..5841f1d 100644 +--- 
a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp ++++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +@@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); +diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp +deleted file mode 100644 +index 44feb20..0000000 +--- a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp ++++ /dev/null +@@ -1,181 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
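// Reference semantics of the PReLU operation implemented by the deleted
// NEPReLUKernel above (ConditionalOperation::PRELU: x * alpha for x < 0, x otherwise),
// written as a scalar sketch. The kernel itself used NEON vbsl/vcgt/vmul plus a
// quantized QASYMM8 path; the simple modulo broadcast below is an illustration only.
#include <cstddef>

static void prelu_reference(const float *input, const float *alpha, float *output,
                            std::size_t n, std::size_t alpha_n)
{
  for (std::size_t i = 0; i < n; ++i)
  {
    const float a = alpha[alpha_n == 1 ? 0 : i % alpha_n]; // broadcast alpha against input
    output[i] = input[i] < 0.0f ? input[i] * a : input[i];
  }
}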
+- */ +- +-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" +- +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/NEON/wrapper/wrapper.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +-#include +-#include +- +-using namespace arm_compute::misc::shape_calculator; +- +-namespace arm_compute +-{ +-namespace +-{ +-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); +- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); +- +- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); +- +- // Validate output if initialized +- if (output->total_size() != 0) +- { +- const DataLayout data_layout = input->data_layout(); +- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +- const int idx_height = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +- const int idx_channel = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); +- const int idx_batch = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); +- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0); +- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0); +- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] != +- output->tensor_shape()[idx_batch]); +- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) != +- 0); +- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != +- output->tensor_shape().total_size()); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); +- } +- +- return Status{}; +-} +-} // namespace +- +-NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx() +- : _input(nullptr), _output(nullptr), _block_shape() +-{ +-} +- +-void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output, +- int32_t block_shape) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape); +- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); +- +- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); +- +- _input = input; +- _block_shape = block_shape; +- _output = output; +- +- // Configure kernel window +- Window win = calculate_max_window(*output->info(), Steps()); +- INEKernel::configure(win); +-} +- +-Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, +- int32_t block_shape) +-{ +- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); +- return Status{}; +-} +- +-void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info) +-{ +- ARM_COMPUTE_UNUSED(info); +- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); +- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); +- +- const DataLayout data_layout = _input->info()->data_layout(); +- const int channel_idx = +- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); +- const int element_size = _input->info()->element_size(); +- +- const size_t channel_size = _input->info()->dimension(channel_idx); +- +- Window slice_out = window.first_slice_window_3D(); +- +- int 
batch_id = 0; +- +- // Main loop for NCHW and NHWC +- if (_output->info()->data_layout() == DataLayout::NCHW) +- { +- do +- { +- Iterator out(_output, slice_out); +- execute_window_loop(slice_out, +- [&](const Coordinates &id) { +- const size_t channel_id = id.z(); +- const size_t in_x = +- id.x() * _block_shape + (channel_id / channel_size) % _block_shape; +- const size_t in_y = +- id.y() * _block_shape + (channel_id / channel_size) / _block_shape; +- const int z = channel_id % channel_size; +- Coordinates input_coords{in_x, in_y, z, batch_id}; +- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); +- }, +- out); +- ++batch_id; +- } while (window.slide_window_slice_3D(slice_out)); +- } +- else +- { +- do +- { +- Iterator out(_output, slice_out); +- execute_window_loop(slice_out, +- [&](const Coordinates &id) { +- const size_t channel_id = id.x(); +- const size_t in_x = +- id.y() * _block_shape + (channel_id / channel_size) % _block_shape; +- const size_t in_y = +- id.z() * _block_shape + (channel_id / channel_size) / _block_shape; +- const int z = channel_id % channel_size; +- Coordinates input_coords{z, in_x, in_y, batch_id}; +- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); +- }, +- out); +- ++batch_id; +- } while (window.slide_window_slice_3D(slice_out)); +- } +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp +deleted file mode 100644 +index 2d379cf..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp ++++ /dev/null +@@ -1,144 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
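// A plain-C++ sketch of the NCHW coordinate mapping used in the deleted
// NESpaceToDepthLayerKernelEx::run() above: the output element at
// (x, y, c_out, batch) is read from input position
// (x*block + (c_out / C_in) % block, y*block + (c_out / C_in) / block, c_out % C_in, batch).
// It assumes height and width are multiples of block, as the kernel's validate()
// enforces; tensor layouts and iterators are simplified away.
#include <cstddef>
#include <vector>

static std::vector<float> space_to_depth_nchw(const std::vector<float> &in, int batch,
                                              int c_in, int h, int w, int block)
{
  const int h_out = h / block, w_out = w / block, c_out = c_in * block * block;
  std::vector<float> out(static_cast<std::size_t>(batch) * c_out * h_out * w_out);
  auto in_at = [&](int b, int c, int y, int x) { return ((b * c_in + c) * h + y) * w + x; };
  auto out_at = [&](int b, int c, int y, int x) { return ((b * c_out + c) * h_out + y) * w_out + x; };
  for (int b = 0; b < batch; ++b)
    for (int c = 0; c < c_out; ++c)
      for (int y = 0; y < h_out; ++y)
        for (int x = 0; x < w_out; ++x)
        {
          const int in_c = c % c_in; // channel_id % channel_size
          const int off = c / c_in;  // block offset encoded in the output channel
          const int in_x = x * block + off % block;
          const int in_y = y * block + off / block;
          out[out_at(b, c, y, x)] = in[in_at(b, in_c, in_y, in_x)];
        }
  return out;
}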
+- */ +- +-#include "arm_compute/runtime/CL/functions/CLArgOperation.h" +- +-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" +-#include "arm_compute/runtime/CL/CLScheduler.h" +- +-namespace arm_compute +-{ +- +-CLArgOperation::CLArgOperation() +-{ +- // DO NOTHING +-} +- +-void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector axis, +- ArgOperation op) +-{ +- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op)); +- _input = input; +- _output = output; +- _axis = axis; +- _arg_op = op; +- // NOTE The argminmax_axis must have no duplication. +- _num_of_kernels = axis.size(); +- const size_t num_of_interm_tensors = _num_of_kernels - 1; +- +- _interm_tensors = arm_compute::support::cpp14::make_unique(num_of_interm_tensors); +- _argop_kernels = +- arm_compute::support::cpp14::make_unique(_num_of_kernels); +- +- TensorShape shape{input->info()->tensor_shape()}; +- for (size_t i = 0; i < num_of_interm_tensors; i++) +- { +- shape.set(_axis[i], 1); +- _interm_tensors[i].allocator()->init( +- TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()) +- .set_data_layout(input->info()->data_layout())); +- _interm_tensors[i].allocator()->allocate(); +- } +- +- // Set a vector that is ordered ICLTensors sequentially. +- std::vector tensors; +- tensors.emplace_back(input); +- for (size_t i = 0; i < num_of_interm_tensors; i++) +- { +- tensors.emplace_back(_interm_tensors.get() + i); +- } +- tensors.emplace_back(output); +- +- // Apply ArgMinMax on all kernels +- for (size_t i = 0; i < _num_of_kernels; i++) +- { +- _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op); +- } +-} +- +-Status CLArgOperation::validate(const ITensorInfo *input, const std::vector &axis, +- const ITensorInfo *output, ArgOperation op) +-{ +- const size_t num_of_kernels = axis.size(); +- const size_t num_of_interm_tensors = num_of_kernels - 1; +- +- // Create temporary tensor infos +- auto interm_tensors = +- arm_compute::support::cpp14::make_unique(num_of_interm_tensors); +- +- // Create intermediate tensor info +- TensorShape shape{input->tensor_shape()}; +- +- for (size_t i = 0; i < num_of_interm_tensors; i++) +- { +- shape.set(axis[i], 1); +- interm_tensors[i].set_data_type(input->data_type()); +- interm_tensors[i].set_tensor_shape(shape); +- interm_tensors[i].set_num_channels(input->num_channels()); +- } +- +- // Set a vector that is ordered ITensorInfo sequentially. 
+- std::vector tensors; +- tensors.emplace_back(input); +- for (size_t i = 0; i < num_of_interm_tensors; i++) +- { +- tensors.emplace_back(interm_tensors.get() + i); +- } +- tensors.emplace_back(output); +- +- // Validate argminmax only on all kernels +- for (size_t i = 0; i < num_of_kernels; i++) +- { +- ARM_COMPUTE_RETURN_ON_ERROR( +- CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op)); +- } +- +- return Status{}; +-} +- +-void CLArgOperation::run() +-{ +- for (size_t i = 0; i < _num_of_kernels; ++i) +- { +- CLScheduler::get().enqueue(_argop_kernels[i]); +- } +-} +- +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp +index 92ee69a..e5122ab 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp +@@ -48,7 +48,7 @@ using namespace arm_compute; + void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp +deleted file mode 100644 +index b3118f3..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp ++++ /dev/null +@@ -1,52 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
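// The deleted CLArgOperation above chains one reduction kernel per requested axis,
// allocating axis.size() - 1 intermediate tensors so that kernel i reads tensors[i]
// and writes tensors[i + 1]. A sketch of that wiring only, with std::function and a
// dummy Buffer type standing in for CLArgOperationKernel / ICLTensor.
#include <cstddef>
#include <functional>
#include <vector>

struct Buffer
{
  std::vector<float> data;
};

static void chain_reductions(const Buffer &input, Buffer &output,
                             const std::vector<std::function<void(const Buffer &, Buffer &)>> &kernels)
{
  std::vector<Buffer> interm(kernels.empty() ? 0 : kernels.size() - 1);
  const Buffer *src = &input;
  for (std::size_t i = 0; i < kernels.size(); ++i)
  {
    Buffer *dst = (i + 1 == kernels.size()) ? &output : &interm[i];
    kernels[i](*src, *dst); // same tensors[i] -> tensors[i + 1] hand-off as configure()
    src = dst;
  }
}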
+- */ +- +-#include "arm_compute/runtime/CL/functions/CLCast.h" +- +-#include "arm_compute/core/CL/kernels/CLCastKernel.h" +- +-using namespace arm_compute; +- +-void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, input_subtype); +- _kernel = std::move(k); +-} +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp +deleted file mode 100644 +index db66250..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp ++++ /dev/null +@@ -1,52 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" +- +-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" +- +-using namespace arm_compute; +- +-void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, block_size); +- _kernel = std::move(k); +-} +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +new file mode 100644 +index 0000000..3dede05 +--- /dev/null ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +@@ -0,0 +1,267 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++/* ++ * Copyright (c) 2019-2020 ARM Limited. ++ * ++ * SPDX-License-Identifier: MIT ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in all ++ * copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" ++ ++#include "arm_compute/core/Helpers.h" ++#include "arm_compute/core/UtilsEx.h" ++#include "arm_compute/core/Validate.h" ++#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" ++#include "arm_compute/runtime/CL/CLScheduler.h" ++ ++#include ++#include ++ ++namespace arm_compute ++{ ++using namespace arm_compute::misc::shape_calculator; ++ ++CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( ++ std::shared_ptr memory_manager) // NOLINT ++ : _memory_group(std::move(memory_manager)), ++ _scale_f(), ++ _conv_f(), ++ _flip_weights(), ++ _scaled_output(), ++ _original_weights(nullptr), ++ _weights_flipped(), ++ _flip_axis(), ++ _is_prepared(false) ++{ ++} ++ ++Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, ++ const ITensorInfo *bias, ITensorInfo *output, ++ const PadStrideInfo &info, unsigned int invalid_right, ++ unsigned int invalid_bottom, ++ const WeightsInfo &weights_info) ++{ ++ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( ++ input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); ++ const DataLayout data_layout = input->data_layout(); ++ ++ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); ++ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); ++ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); ++ ++ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); ++ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); ++ ++ auto out_dims = transposeconv_output_dimensions( ++ input->dimension(idx_w), 
input->dimension(idx_h), weights->dimension(idx_w), ++ weights->dimension(idx_h), info, invalid_right, invalid_bottom); ++ ++ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); ++ ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); ++ ++ if (bias != nullptr) ++ { ++ if (is_data_type_quantized_asymmetric(input->data_type())) ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ++ } ++ else ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ++ } ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); ++ } ++ ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], ++ "Output's width is invalid."); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], ++ "Output's height is invalid."); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], ++ "Output's depth is invalid."); ++ ++ unsigned int pad_left = 0; ++ unsigned int pad_right = 0; ++ unsigned int pad_top = 0; ++ unsigned int pad_bottom = 0; ++ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( ++ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, ++ pad_bottom); ++ TensorInfo scale_out_info(input->clone() ++ ->set_is_resizable(true) ++ .reset_padding() ++ .set_tensor_shape(scale_out_shape) ++ .set_data_layout(data_layout)); ++ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ++ ++ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); ++ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, ++ conv_info, weights_info)); ++ ++ return Status{}; ++} ++ ++void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, ++ const ICLTensor *bias, ICLTensor *output, ++ const PadStrideInfo &info, unsigned int invalid_right, ++ unsigned int invalid_bottom, ++ const WeightsInfo &weights_info) ++{ ++ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, ++ invalid_right, invalid_bottom, weights_info); ++} ++ ++void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context, ++ ICLTensor *input, ICLTensor *weights, ++ const ICLTensor *bias, ICLTensor *output, ++ const PadStrideInfo &info, unsigned int invalid_right, ++ unsigned int invalid_bottom, ++ const WeightsInfo &weights_info) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ++ ++ unsigned int pad_left = 0; ++ unsigned int pad_right = 0; ++ unsigned int pad_top = 0; ++ unsigned int pad_bottom = 0; ++ const unsigned int stride_x = info.stride().first; ++ const unsigned int stride_y = info.stride().second; ++ ++ const DataLayout data_layout = input->info()->data_layout(); ++ ++ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); ++ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); ++ ++ _original_weights = weights; ++ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); ++ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); ++ _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); ++ ++ auto out_dims = transposeconv_output_dimensions( ++ input->info()->dimension(idx_w), input->info()->dimension(idx_h), ++ 
weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, ++ invalid_bottom); ++ ++ const TensorShape output_shape = ++ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); ++ ++ // Output auto initialization if not yet initialized ++ auto_init_if_empty( ++ *output->info(), ++ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); ++ ++ // Perform validation step ++ ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate( ++ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), ++ info, invalid_right, invalid_bottom)); ++ ++ _is_prepared = weights_info.retain_internal_weights(); ++ ++ _memory_group.manage(&_scaled_output); ++ ++ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order ++ // to match output shape ++ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( ++ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, ++ pad_right, pad_top, pad_bottom); ++ ++ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), ++ input->info()->quantization_info()); ++ scale_out_info.set_data_layout(data_layout); ++ _scaled_output.allocator()->init(scale_out_info); ++ ++ // configure scale function ++ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, ++ DimensionRoundingType::FLOOR); ++ _scale_f.configure(input, &_scaled_output, upsample_info); ++ ++ // Setup the function to convolve the upscaled output ++ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ++ _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info, ++ weights_info); ++ _scaled_output.allocator()->allocate(); ++ ++ // Setup flip axis data ++ _flip_axis.allocator()->allocate(); ++ _flip_axis.map(true); ++ auto axis_data = reinterpret_cast(_flip_axis.buffer()); ++ if (weights->info()->data_layout() == DataLayout::NHWC) ++ { ++ axis_data[0] = 1; ++ axis_data[1] = 2; ++ } ++ else ++ { ++ axis_data[0] = 0; ++ axis_data[1] = 1; ++ } ++ _flip_axis.unmap(); ++} ++ ++void CLDirectTransposeConvLayer::run() ++{ ++ prepare(); ++ ++ MemoryGroupResourceScope scope_mg(_memory_group); ++ ++ _scale_f.run(); ++ _conv_f.run(); ++} ++ ++void CLDirectTransposeConvLayer::prepare() ++{ ++ if (!_is_prepared) ++ { ++ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); ++ ++ // Run weights flipping and mark original weights tensor as unused ++ _weights_flipped.allocator()->allocate(); ++ _flip_weights.run(); ++ _original_weights->mark_as_unused(); ++ ++ // Prepare convolution ++ _conv_f.prepare(); ++ ++ // Free flipped weights ++ if (!_weights_flipped.is_used()) ++ { ++ _weights_flipped.allocator()->free(); ++ } ++ ++ _is_prepared = true; ++ } ++} ++} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp +index 3d9a28a..ae9d8af 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp +@@ -47,7 +47,7 @@ using namespace arm_compute; + void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output, lookups); + 
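// The new CLDirectTransposeConvLayer above realizes a transposed convolution as
// upsampling (_scale_f), weight flipping (_flip_weights along _flip_axis) and a
// stride-1 convolution (_conv_f). The two 1-D float helpers below sketch why that
// decomposition works: the scatter form and the "zero-insert, pad, correlate with
// the flipped kernel" form produce the same output. Padding and the
// invalid_right / invalid_bottom trimming are left out; x and w are assumed non-empty.
#include <cstddef>
#include <vector>

static std::vector<float> tconv_scatter(const std::vector<float> &x,
                                        const std::vector<float> &w, std::size_t stride)
{
  std::vector<float> y((x.size() - 1) * stride + w.size(), 0.0f);
  for (std::size_t i = 0; i < x.size(); ++i)
    for (std::size_t j = 0; j < w.size(); ++j)
      y[i * stride + j] += x[i] * w[j]; // each input sample scatters a scaled kernel copy
  return y;
}

static std::vector<float> tconv_upsample_then_conv(const std::vector<float> &x,
                                                   const std::vector<float> &w, std::size_t stride)
{
  const std::size_t m = w.size();
  std::vector<float> z((x.size() - 1) * stride + 1 + 2 * (m - 1), 0.0f);
  for (std::size_t i = 0; i < x.size(); ++i)
    z[(m - 1) + i * stride] = x[i]; // zero-insertion upsample, padded by m-1 on both sides
  std::vector<float> y(z.size() - m + 1, 0.0f);
  for (std::size_t j = 0; j < y.size(); ++j)
    for (std::size_t t = 0; t < m; ++t)
      y[j] += z[j + t] * w[m - 1 - t]; // stride-1 correlation with the flipped kernel
  return y;
}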
_kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +index f098832..0198946 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +@@ -45,7 +45,7 @@ + #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "arm_compute/runtime/CL/CLScheduler.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + #include + +@@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I + ARM_COMPUTE_UNUSED(weights); + ARM_COMPUTE_UNUSED(output); + ARM_COMPUTE_RETURN_ON_ERROR( +- CLGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output)); ++ CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + + return Status{}; + } +@@ -68,7 +68,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I + + void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); + } +@@ -172,7 +172,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen + + // Quantize input + _quantized_input.allocator()->init( +- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); ++ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( ++ DataType::QASYMM8_SIGNED)); + _memory_group.manage(&_quantized_input); + _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input); + +@@ -199,7 +200,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + +@@ -256,8 +257,9 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe + ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor)); + + // Validate quantization symm8 kernel +- const ITensorInfo &quantized_input = TensorInfo( +- input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); ++ const ITensorInfo &quantized_input = ++ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( ++ DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); + +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +index 63e291b..2ff4b96 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +@@ -46,7 +46,7 @@ + #include 
"arm_compute/core/utils/misc/ShapeCalculator.h" + #include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "arm_compute/runtime/CL/CLScheduler.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + #include + +@@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I + + void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +index 9aebc47..157b4d9 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +@@ -53,18 +53,21 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr(fc); + } +- else ++ else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS) + { +- assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); +- + bool is_hybrid = (input->info()->data_type() == DataType::F32 || + input->info()->data_type() == DataType::F16) && +- weights->info()->data_type() == DataType::S8; ++ (weights->info()->data_type() == DataType::S8 || ++ weights->info()->data_type() == DataType::QASYMM8_SIGNED); + + if (is_hybrid) + { + auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager}; ++ ITensorInfo *weights_info = const_cast(_weights->info()); ++ const auto orgin_weights_data_type = weights_info->data_type(); ++ weights_info->set_data_type(DataType::QASYMM8_SIGNED); + fc->configure(input_to_use, _weights, _biases, _output); ++ weights_info->set_data_type(orgin_weights_data_type); + return std::unique_ptr(fc); + } + else +@@ -74,6 +77,11 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp + return std::unique_ptr(fc); + } + } ++ else ++ { ++ throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type"); ++ } ++ + }(); + + if (_needs_reshape) +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp +deleted file mode 100644 +index ca5499d..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp ++++ /dev/null +@@ -1,180 +0,0 @@ +-/* +- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h" +- +-#include "arm_compute/core/CL/ICLTensor.h" +-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +-#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +-#include "arm_compute/runtime/CL/CLScheduler.h" +-#include "arm_compute/runtime/MemoryGroup.h" +- +-namespace arm_compute +-{ +-using namespace arm_compute::misc::shape_calculator; +-using namespace arm_compute::cl_gemm; +- +-namespace +-{ +-inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target) +-{ +- return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run); +-} +-} // namespace +- +-CLGEMMLowpMatrixMultiplyCoreEx::CLGEMMLowpMatrixMultiplyCoreEx( +- std::shared_ptr memory_manager) +- : _memory_group(std::move(memory_manager)), _mm_midgard_kernel(), _mtx_a_reduction_kernel(), +- _mtx_b_reduction_kernel(), _vector_sum_col(), _vector_sum_row(), _a_offset(0), _b_offset(0), +- _reshape_b_only_on_first_run(false), _is_prepared(false) +-{ +-} +- +-void CLGEMMLowpMatrixMultiplyCoreEx::configure(const ICLTensor *a, const ICLTensor *b, +- const ICLTensor *c, ICLTensor *output, +- const GEMMInfo &gemm_info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); +- ARM_COMPUTE_UNUSED(c); +- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCoreEx::validate( +- a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), gemm_info)); +- +- _is_prepared = false; +- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); +- _a_offset = a->info()->quantization_info().uniform().offset; +- _b_offset = b->info()->quantization_info().uniform().offset; +- +- // Get the GPU target +- const GPUTarget gpu_target = CLScheduler::get().target(); +- +- // Set the target for the kernels +- _mm_midgard_kernel.set_target(gpu_target); +- +- // GEMMRHSMatrixInfo rhs_info; +- // GEMMLHSMatrixInfo lhs_info; +- +- // Arguments used by GEMMReshapeInfo +- // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, +- // n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo +- // in order to know how the matrices have been reshaped +- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); +- const unsigned int m = reinterpret_input_as_3d +- ? (a->info()->dimension(1) * a->info()->dimension(2)) +- : a->info()->dimension(1); +- const unsigned int n = b->info()->dimension(0); +- const unsigned int k = a->info()->dimension(0); +- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); +- +- const ICLTensor *matrix_b = b; +- // Configure matrix multiply kernel +- _mm_midgard_kernel.configure( +- a, matrix_b, output, +- GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); +-} +- +-Status CLGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b, +- const ITensorInfo *c, const ITensorInfo *output, +- const GEMMInfo &gemm_info) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); +- ARM_COMPUTE_UNUSED(c); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), +- "Matrix A already reshaped is not supported"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), +- "Matrix B already reshaped is not supported"); +- +- const ITensorInfo *matrix_a_info = a; +- +- // Get the GPU target +- const GPUTarget gpu_target = CLScheduler::get().target(); +- +- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); +- const unsigned int m = +- reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); +- const unsigned int n = b->dimension(0); +- const unsigned int k = a->dimension(0); +- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); +- +- bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), gpu_target); +- +- const GEMMReshapeInfo reshape_info = +- GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); +- +- TensorInfo weights_info(*b); +- const ITensorInfo *matrix_b_info = &weights_info; +- if (reshape_matrix_b) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(false, +- "CLGEMMLowpMatrixMultiplyCoreEx does not support reshape_b"); +- } +- +- // Validate matrix multiply +- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernelEx::validate( +- matrix_a_info, matrix_b_info, output, reshape_info)); +- +- return Status{}; +-} +- +-void CLGEMMLowpMatrixMultiplyCoreEx::run() +-{ +- prepare(); +- +- MemoryGroupResourceScope scope_mg(_memory_group); +- +- // Run matrix multiply +- CLScheduler::get().enqueue(_mm_midgard_kernel, false); +-} +- +-void CLGEMMLowpMatrixMultiplyCoreEx::prepare() +-{ +- if (!_is_prepared) +- { +- _is_prepared = true; +- } +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp +index f594d7a..e0b833b 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp +@@ -48,7 +48,7 @@ using namespace arm_compute; + void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, + int axis) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp +index 27ed8e8..65b89a3 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp +@@ -47,7 +47,7 @@ using namespace arm_compute; + void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp +index 80393e8..5a7e408 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp +@@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} + void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, float epsilon) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output, gamma, beta, epsilon); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp +deleted file mode 100644 +index fbb15ab..0000000 +--- 
a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp ++++ /dev/null +@@ -1,63 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/CL/functions/CLPReLU.h" +- +-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-using namespace arm_compute; +- +-void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, alpha, output); +- _kernel = std::move(k); +- +- if (output->info()->dimension(0) > 1) +- { +- ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; +- +- if (broadcasted_info->info()->dimension(0) == 1) +- { +- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); +- } +- } +-} +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp +deleted file mode 100644 +index 6049b7e..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp ++++ /dev/null +@@ -1,163 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
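// The hybrid fully-connected path touched above quantizes the F32 input symmetrically
// to 8 bit (now tagged QASYMM8_SIGNED rather than S8) with a scale factor before the
// low-precision matrix multiply. A sketch of one common symmetric scheme,
// scale = max|x| / 127 and q = round(x / scale) clamped to [-127, 127]; the exact
// rounding and clamping used by CLScaleFactorSymm8Kernel / CLQuantizationSymmetricKernel
// may differ.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<int8_t> quantize_symmetric(const std::vector<float> &row, float &scale_out)
{
  float max_abs = 0.0f;
  for (float v : row)
    max_abs = std::max(max_abs, std::fabs(v));
  scale_out = max_abs > 0.0f ? max_abs / 127.0f : 1.0f;

  std::vector<int8_t> q(row.size());
  for (std::size_t i = 0; i < row.size(); ++i)
  {
    const float r = std::round(row[i] / scale_out);
    q[i] = static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, r)));
  }
  return q;
}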
+- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h" +- +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Utils.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +-#include "arm_compute/runtime/CL/CLScheduler.h" +-#include "support/ToolchainSupport.h" +- +-#include +- +-using namespace arm_compute; +-using namespace arm_compute::misc::shape_calculator; +- +-CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr memory_manager) +- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), +- _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), +- _gemm_output(), _add_output(), _is_prepared(false) +-{ +-} +- +-Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, +- const ITensorInfo *recurrent_weights, const ITensorInfo *bias, +- const ITensorInfo *hidden_state, const ITensorInfo *output, +- const ActivationLayerInfo &info) +-{ +- const int idx_width = 0; +- const int idx_height = 1; +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, +- output); +- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); +- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != +- recurrent_weights->dimension(idx_width)); +- ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != +- recurrent_weights->dimension(1)); +- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); +- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), +- hidden_state->tensor_shape()); +- +- auto shape_info = +- TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, +- input->data_type()); +- +- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); +- ARM_COMPUTE_RETURN_ON_ERROR( +- 
CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); +- ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate( +- ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); +- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info)); +- +- return Status{}; +-} +- +-void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights, +- const ICLTensor *recurrent_weights, const ICLTensor *bias, +- ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); +- ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(), +- recurrent_weights->info(), bias->info(), +- hidden_state->info(), output->info(), info)); +- +- const int idx_height = 1; +- TensorShape shape = +- compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); +- +- _is_prepared = false; +- +- _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- +- // Manage intermediate buffers and configure +- _memory_group.manage(&_fully_connected_out); +- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); +- +- _memory_group.manage(&_gemm_output); +- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f); +- +- _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- _memory_group.manage(&_add_output); +- +- _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, +- &_add_output, ConvertPolicy::SATURATE); +- +- _fully_connected_out.allocator()->allocate(); +- _gemm_output.allocator()->allocate(); +- +- _activation_kernel.configure(&_add_output, hidden_state, info); +- _add_output.allocator()->allocate(); +- +- _copy_kernel.configure(hidden_state, output); +-} +- +-void CLRNNLayerEx::run() +-{ +- prepare(); +- +- _memory_group.acquire(); +- +- _fully_connected_kernel.run(); +- _gemm_state_f.run(); +- CLScheduler::get().enqueue(_add_kernel); +- CLScheduler::get().enqueue(_activation_kernel); +- +- // copy hidden out to output +- CLScheduler::get().enqueue(_copy_kernel); +- +- _memory_group.release(); +-} +- +-void CLRNNLayerEx::prepare() +-{ +- if (!_is_prepared) +- { +- _fully_connected_kernel.prepare(); +- _gemm_state_f.prepare(); +- +- _is_prepared = true; +- } +-} +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +index 8ce2d74..a41e6db 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +@@ -60,8 +60,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * + const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 
1 : 0); + + // Create temporary tensor infos +- auto interm_tensors = +- arm_compute::support::cpp14::make_unique(num_of_interm_tensors); ++ auto interm_tensors = support::cpp14::make_unique(num_of_interm_tensors); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; +@@ -119,9 +118,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0); + +- _interm_tensors = arm_compute::support::cpp14::make_unique(num_of_interm_tensors); +- _reduce_kernels = +- arm_compute::support::cpp14::make_unique(num_of_kernels); ++ _interm_tensors = support::cpp14::make_unique(num_of_interm_tensors); ++ _reduce_kernels = support::cpp14::make_unique(num_of_kernels); + + // Set a vector that is ordered ICLTensors sequentially. + std::vector tensors; +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp +deleted file mode 100644 +index 7d7b226..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp ++++ /dev/null +@@ -1,52 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2016-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
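CLReduceOperation above applies the same shortening to its array allocations. A small sketch of the validate() portion with a plausible element type spelled out; the hunk itself does not show the template argument, so TensorInfo[] is an assumption based on how the intermediate infos are used:

const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
// One TensorInfo per intermediate result of the reduction chain; the array form of
// make_unique value-initialises the elements.
auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);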
+- */ +- +-#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" +- +-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" +- +-using namespace arm_compute; +- +-void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, block_size); +- _kernel = std::move(k); +-} +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +index e61746e..3215d01 100644 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +@@ -15,7 +15,7 @@ + */ + + /* +- * Copyright (c) 2017-2018 ARM Limited. ++ * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,218 +37,124 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +- + #include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h" +-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +-#include "arm_compute/core/Helpers.h" + #include "arm_compute/core/Utils.h" +-#include "arm_compute/core/UtilsEx.h" + #include "arm_compute/core/Validate.h" + #include "arm_compute/core/utils/misc/ShapeCalculator.h" ++#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "arm_compute/runtime/CL/CLScheduler.h" +-#include "arm_compute/runtime/CPP/CPPScheduler.h" + ++#include + #include + #include + + using namespace arm_compute; + using namespace arm_compute::misc::shape_calculator; + +-CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr memory_manager) // NOLINT +- : _memory_group(std::move(memory_manager)), +- _scale_f(), +- _conv_f(), +- _flip_weights(), +- _scaled_output(), +- _original_weights(nullptr), +- _weights_flipped(), +- _is_prepared(false) ++CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr memory_manager) ++ : _memory_manager(std::move(memory_manager)), _function() ++{ ++} ++ ++void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ++ ICLTensor *output, const PadStrideInfo &deconv_info, ++ unsigned int invalid_right, unsigned int invalid_bottom, ++ const WeightsInfo &weights_info) + { ++ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, ++ invalid_right, invalid_bottom, weights_info); ++} ++ ++void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ++ ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, ++ const PadStrideInfo &deconv_info, unsigned int invalid_right, ++ unsigned int invalid_bottom, const WeightsInfo &weights_info) ++{ ++ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ++ ++ switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, ++ output->info(), deconv_info, invalid_right, ++ invalid_bottom, weights_info)) ++ { ++ case DeconvolutionMethod::DIRECT: ++ { ++ auto f = arm_compute::support::cpp14::make_unique(); ++ f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right, ++ invalid_bottom, weights_info); ++ _function = std::move(f); ++ break; ++ } ++ case DeconvolutionMethod::GEMM: ++ { ++ auto f = arm_compute::support::cpp14::make_unique(_memory_manager); ++ f->configure(compile_context, input, weights, bias, output, deconv_info); ++ _function = std::move(f); ++ 
break; ++ } ++ default: ++ ARM_COMPUTE_ERROR("Not supported."); ++ break; ++ } + } + + Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, +- const PadStrideInfo &info, unsigned int invalid_right, ++ const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, +- DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); +- +- const DataLayout data_layout = input->data_layout(); +- +- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); +- +- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); +- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); +- +- const unsigned int kernel_x = weights->dimension(idx_w); +- const unsigned int kernel_y = weights->dimension(idx_h); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1, +- "invalid_right must be smaller than kernel_x"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1, +- "inner_border_top must be smaller than kernel_y"); +- +- // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added. +- auto out_dims = transposeconv_output_dimensions( +- input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), +- weights->dimension(idx_h), info, invalid_right, invalid_bottom); +- +- const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); +- +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); +- +- if (bias != nullptr) ++ switch (CLTransposeConvLayer::get_deconvolution_method( ++ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) + { +- if (is_data_type_quantized_asymmetric(input->data_type())) ++ case DeconvolutionMethod::DIRECT: + { +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ++ // Validate direct convolution layer ++ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( ++ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); ++ break; + } +- else ++ case DeconvolutionMethod::GEMM: + { +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ++ // Validate gemm-based convolution layer ++ ARM_COMPUTE_RETURN_ON_ERROR( ++ CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); ++ break; + } +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); ++ default: ++ ARM_COMPUTE_ERROR("Not supported."); ++ break; + } + +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], +- "Output's width is invalid."); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], +- "Output's height is invalid."); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], +- "Output's depth is invalid."); +- +- unsigned int pad_left = 0; +- unsigned int pad_right = 0; +- unsigned int pad_top = 0; +- unsigned int pad_bottom = 0; +- const TensorShape scale_out_shape 
= compute_transposeconv_upsampled_shape( +- *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, +- pad_bottom); +- TensorInfo scale_out_info(input->clone() +- ->set_is_resizable(true) +- .reset_padding() +- .set_tensor_shape(scale_out_shape) +- .set_data_layout(data_layout)); +- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); +- +- ARM_COMPUTE_RETURN_ON_ERROR( +- CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info)); +- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, +- conv_info, weights_info)); +- + return Status{}; + } + +-void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, +- ICLTensor *output, const PadStrideInfo &info, +- unsigned int invalid_right, unsigned int invalid_bottom, +- const WeightsInfo &weights_info) ++DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( ++ const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ++ ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, ++ unsigned int invalid_bottom, const WeightsInfo &weights_info) + { +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); +- +- const unsigned int stride_x = info.stride().first; +- const unsigned int stride_y = info.stride().second; ++ ARM_COMPUTE_UNUSED(output, bias, weights_info); + +- const DataLayout data_layout = input->info()->data_layout(); ++ const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + +- _original_weights = weights; +- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); +- _flip_weights.configure(weights, &_weights_flipped); +- +- // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were +- // added. +- auto out_dims = transposeconv_output_dimensions( +- input->info()->dimension(idx_w), input->info()->dimension(idx_h), +- weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, +- invalid_bottom); +- +- const TensorShape output_shape = +- compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); +- +- // Output auto initialization if not yet initialized +- auto_init_if_empty( +- *output->info(), +- input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); +- +- // Perform validation step +- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate( +- input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), +- info, invalid_right, invalid_bottom)); +- +- _is_prepared = weights_info.retain_internal_weights(); +- +- _memory_group.manage(&_scaled_output); +- +- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order +- // to match output shape +- unsigned int pad_left = 0; +- unsigned int pad_right = 0; +- unsigned int pad_top = 0; +- unsigned int pad_bottom = 0; +- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( +- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, +- pad_right, pad_top, pad_bottom); +- +- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), +- input->info()->quantization_info()); +- scale_out_info.set_data_layout(data_layout); +- _scaled_output.allocator()->init(scale_out_info); +- +- // configure scale function +- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, +- DimensionRoundingType::FLOOR); +- _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info); +- +- // setup the function to convolve the upscaled output +- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); +- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info); +- _scaled_output.allocator()->allocate(); ++ if (weights->dimension(idx_w) != deconv_info.stride().first || ++ weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 || ++ invalid_bottom != 0) ++ { ++ return DeconvolutionMethod::DIRECT; ++ } ++ ++ return DeconvolutionMethod::GEMM; + } + + void CLTransposeConvLayer::run() + { + prepare(); +- +- _memory_group.acquire(); +- +- _scale_f.run(); +- _conv_f.run(); +- +- _memory_group.release(); ++ _function->run(); + } + +-void CLTransposeConvLayer::prepare() +-{ +- if (!_is_prepared) +- { +- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); +- +- // Run weights flipping and mark original weights tensor as unused +- _weights_flipped.allocator()->allocate(); +- _weights_flipped.map(true); +- _original_weights->map(CLScheduler::get().queue(), true); +- CPPScheduler::get().schedule(&_flip_weights, Window::DimZ); +- _weights_flipped.unmap(); +- _original_weights->unmap(CLScheduler::get().queue()); +- _original_weights->mark_as_unused(); +- +- // Prepare convolution +- _conv_f.prepare(); +- +- if (!_weights_flipped.is_used()) +- { +- _weights_flipped.allocator()->free(); +- } +- +- _is_prepared = true; +- } +-} ++void CLTransposeConvLayer::prepare() { _function->prepare(); } +diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp +deleted file mode 100644 +index 07feb5a..0000000 +--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp ++++ /dev/null +@@ -1,92 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
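The rewritten CLTransposeConvLayer above no longer owns upsample and convolution stages of its own; configure() selects a backing function once and run()/prepare() simply forward to it. A condensed sketch of that dispatch, with the function types taken from the validate() hunk above and the template arguments of make_unique restored accordingly:

void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input,
                                     ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
                                     const PadStrideInfo &deconv_info, unsigned int invalid_right,
                                     unsigned int invalid_bottom, const WeightsInfo &weights_info)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

  switch (get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(),
                                   deconv_info, invalid_right, invalid_bottom, weights_info))
  {
    case DeconvolutionMethod::DIRECT:
    {
      // Chosen when the kernel extent differs from the stride or invalid_right/invalid_bottom
      // is non-zero; only the direct implementation reproduces that output shape.
      auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
      f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
                   invalid_bottom, weights_info);
      _function = std::move(f);
      break;
    }
    case DeconvolutionMethod::GEMM:
    {
      auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
      f->configure(compile_context, input, weights, bias, output, deconv_info);
      _function = std::move(f);
      break;
    }
    default:
      ARM_COMPUTE_ERROR("Not supported.");
      break;
  }
}

void CLTransposeConvLayer::run()
{
  prepare();
  _function->run();
}

void CLTransposeConvLayer::prepare() { _function->prepare(); }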
+- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2018 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" +- +-#include "arm_compute/core/CL/OpenCL.h" +-#include "arm_compute/core/Utils.h" +-#include "arm_compute/runtime/CL/CLScheduler.h" +-#include "arm_compute/core/CL/ICLTensor.h" +- +-#include +-#include +-#include +- +-using namespace arm_compute; +- +-CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT +- : _upsample(), +- _output(nullptr) +-{ +-} +- +-Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, +- const BorderSize &inner_border, +- const PadStrideInfo &info) +-{ +- return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info); +-} +- +-void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output, +- const BorderSize &inner_border, +- const PadStrideInfo &info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- _output = output; +- _upsample.configure(input, _output, inner_border, info); +-} +- +-void CLTransposeConvLayerUpsample::run() +-{ +- _output->map(CLScheduler::get().queue(), true); +- if (is_data_type_quantized_asymmetric(_output->info()->data_type())) +- { +- const uint8_t quantized_zero = _output->info()->quantization_info().uniform().offset; +- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero); +- } +- else +- { +- memset(_output->buffer(), 0, _output->info()->total_size()); +- } +- _output->unmap(CLScheduler::get().queue()); +- +- CLScheduler::get().enqueue(_upsample, false); +-} +diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp +index 114e1a7..768c15b 100644 +--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp +@@ -41,14 +41,14 @@ + #include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h" + + #include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + using namespace arm_compute; + + void CPPOneHotEx::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, + const 
ITensor *off_value, ITensor *output, const int axis) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(indices, depth, on_value, off_value, output, axis); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp +deleted file mode 100644 +index 6c90ef3..0000000 +--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp ++++ /dev/null +@@ -1,53 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" +- +-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" +-#include "support/ToolchainSupport.h" +- +-using namespace arm_compute; +- +-void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, info); +- _kernel = std::move(k); +-} +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp +index ff81ff8..2752eb6 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp +@@ -42,7 +42,7 @@ + + #include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h" + #include "arm_compute/runtime/IRuntimeContext.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + namespace arm_compute + { +@@ -53,7 +53,7 @@ NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT + void NEActivationLayerEx::configure(ITensor *input, ITensor *output, + ActivationLayerInfo activation_info) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output, activation_info); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp +index e42c453..2fc94b2 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp +@@ -42,7 +42,7 @@ + #include + + #include "arm_compute/core/ITensor.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + #include + +@@ -53,7 +53,7 @@ template + void NEBinaryLogicalOperationStatic::configure(ITensor *input1, ITensor *input2, + ITensor *output) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(COP, input1, input2, output); + _kernel = std::move(k); + } +@@ -69,7 +69,7 @@ Status NEBinaryLogicalOperationStatic::validate(const ITensorInfo *input1, + void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, + BinaryLogicalOperation op) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(op, input1, input2, output); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp +deleted file mode 100644 +index dc5c620..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp ++++ /dev/null +@@ -1,60 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/NEON/functions/NECast.h" +- +-#include "arm_compute/core/NEON/kernels/NECastKernel.h" +-#include "support/ToolchainSupport.h" +- +-namespace arm_compute +-{ +-void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, input_subtype); +- _kernel = std::move(k); +-} +- +-Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, +- SubDataType input_subtype) +-{ +- return NECastKernel::validate(input, output, input_subtype); +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp +deleted file mode 100644 +index 5ec0b86..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp ++++ /dev/null +@@ -1,63 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. 
+- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +- +-namespace arm_compute +-{ +-void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, block_shape); +- _kernel = std::move(k); +-} +- +-Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, +- int32_t block_shape) +-{ +- return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape); +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp +index 53fb150..e0ab3e0 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp +@@ -41,13 +41,13 @@ + #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" + + #include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + using namespace arm_compute; + + void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output, lookups); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +index f457732..a123439 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +@@ -58,7 +58,7 @@ namespace + Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) + { + ARM_COMPUTE_RETURN_ON_ERROR( +- NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output)); ++ NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, 
&output)); + + return Status{}; + } +@@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I + + void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); + } +@@ -158,7 +158,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor + + // Quantize input + _quantized_input.allocator()->init( +- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); ++ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( ++ DataType::QASYMM8_SIGNED)); + _scale_factor.allocator()->init( + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); + _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); +@@ -186,7 +187,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8); ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); +@@ -224,8 +225,9 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); + + // Validate quantization kernel +- const ITensorInfo &quantized_input = TensorInfo( +- input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); ++ const ITensorInfo &quantized_input = ++ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( ++ DataType::QASYMM8_SIGNED)); + const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +index fcac3c7..dc6c784 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +@@ -56,12 +56,17 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input + assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); + + bool is_hybrid = input->info()->data_type() == DataType::F32 && +- weights->info()->data_type() == DataType::S8; ++ (weights->info()->data_type() == DataType::S8 || ++ weights->info()->data_type() == DataType::QASYMM8_SIGNED); + + if (is_hybrid) + { + auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager}; ++ ITensorInfo *weights_info = const_cast(_weights->info()); ++ const auto orgin_weights_data_type = weights_info->data_type(); ++ weights_info->set_data_type(DataType::QASYMM8_SIGNED); + fc->configure(input_to_use, _weights, _biases, _output); ++ 
weights_info->set_data_type(orgin_weights_data_type); + return std::unique_ptr(fc); + } + else +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp +deleted file mode 100644 +index 1290cfd..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp ++++ /dev/null +@@ -1,513 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
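In the NEFullyConnectedReshapingLayer hunk above, the hybrid path now also accepts QASYMM8_SIGNED weights, and S8 weights are retagged as QASYMM8_SIGNED only for the duration of NEFullyConnectedHybridLayer::configure(), matching the stricter data-type check the hybrid layer now performs. A sketch of that block with the template arguments spelled out; the const_cast and unique_ptr target types are inferred, not shown in the hunk:

bool is_hybrid = input->info()->data_type() == DataType::F32 &&
                 (weights->info()->data_type() == DataType::S8 ||
                  weights->info()->data_type() == DataType::QASYMM8_SIGNED);

if (is_hybrid)
{
  auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
  // The hybrid layer validates its weights as QASYMM8_SIGNED, so S8 weights are temporarily
  // relabelled around configure() and restored right afterwards.
  ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
  const auto origin_weights_data_type = weights_info->data_type();
  weights_info->set_data_type(DataType::QASYMM8_SIGNED);
  fc->configure(input_to_use, _weights, _biases, _output);
  weights_info->set_data_type(origin_weights_data_type);
  return std::unique_ptr<arm_compute::IFunction>(fc);
}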
+- */ +- +-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/ITensor.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +-#include "arm_compute/runtime/NEON/NEScheduler.h" +-#include "arm_compute/runtime/TensorAllocator.h" +-#include "support/ToolchainSupport.h" +- +-using namespace arm_compute; +-using namespace arm_compute::misc::shape_calculator; +- +-NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx( +- std::shared_ptr memory_manager) +- : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), +- _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), +- _mtx_b_reduction_kernel(), _offset_contribution_kernel(), +- _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), +- _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), +- _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), +- _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), +- _fuse_output_stage(false), _flip_signedness(false) +-{ +-} +- +-void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c, +- ITensor *output, const GEMMInfo &gemm_info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); +- ARM_COMPUTE_UNUSED(c); +- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate( +- a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); +- +- const ITensor *matrix_a = a; +- const ITensor *matrix_b = b; +- GEMMInfo info = gemm_info; +- +- // Clear state +- _mtx_a_reshape_kernel = nullptr; +- _mtx_b_reshape_kernel = nullptr; +- +- // Set internal variables +- _a_offset = a->info()->quantization_info().uniform().offset; +- _b_offset = b->info()->quantization_info().uniform().offset; +- _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; +- _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); +- _is_prepared = false; +- _fused_assembly_path = false; +- _original_b = b; +- +- const ITensor *a_to_use = a; +- +- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage +- if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) +- { +- _fuse_output_stage = true; +- _memory_group.manage(&_mm_result_s32); +- TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32); +- _mm_result_s32.allocator()->init(info_mm_result_s32); +- } +- +-#ifdef __aarch64__ +- switch (a->info()->data_type()) +- { +- case DataType::QASYMM8: +- case DataType::QASYMM8_SIGNED: +- case DataType::U8: +- case DataType::S8: +- { +- if (a_to_use->info()->data_type() == DataType::QASYMM8 && +- info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) +- { +- _asm_glue.configure(a_to_use, b, c, output, gemm_info); +- _fused_assembly_path = _asm_glue.is_configured(); +- } +- else +- { +- _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? 
&_mm_result_s32 : output, +- gemm_info); +- } +- _assembly_path = _asm_glue.is_configured(); +- break; +- } +- default: +- { +- ARM_COMPUTE_ERROR("Datatype not supported"); +- break; +- } +- } +-#endif /* __aarch64__ */ +- if (!(_assembly_path || _run_vector_matrix_multiplication)) +- { +- matrix_a = &_tmp_a; +- matrix_b = &_tmp_b; +- +- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / +- // 4.0f) ] +- TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, +- a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); +- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / +- // 16.0f) ] +- TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), +- b->info()->quantization_info()); +- _tmp_a.allocator()->init(a_info); +- _tmp_b.allocator()->init(b_info); +- _memory_group.manage(&_tmp_a); +- if (!_reshape_b_only_on_first_run) +- { +- _memory_group.manage(&_tmp_b); +- } +- +- // Configure interleave kernel +- { +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(a_to_use, &_tmp_a); +- _mtx_a_reshape_kernel = std::move(k); +- } +- +- // Configure transpose kernel +- { +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(b, &_tmp_b); +- _mtx_b_reshape_kernel = std::move(k); +- } +- } +- +- if (!_fused_assembly_path) +- { +- // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 +- if (_a_offset != 0) +- { +- TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); +- +- _vector_sum_col.allocator()->init(info_vector_sum_col); +- if (!_reshape_b_only_on_first_run) +- { +- _memory_group.manage(&_vector_sum_col); +- } +- +- // Configure Matrix B reduction kernel +- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false); +- } +- +- // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 +- if (_b_offset != 0) +- { +- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); +- +- _vector_sum_row.allocator()->init(info_vector_sum_row); +- _memory_group.manage(&_vector_sum_row); +- +- // Configure matrix A reduction kernel +- _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), +- false); +- } +- +- if (_fuse_output_stage) +- { +- // Configure matrix multiply kernel +- if (!_assembly_path) +- { +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(matrix_a, matrix_b, &_mm_result_s32); +- _mm_kernel = std::move(k); +- } +- +- _offset_contribution_output_stage_kernel.configure( +- &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, +- _b_offset == 0 ? nullptr : &_vector_sum_row, c, +- _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset, +- _b_offset, info.gemmlowp_output_stage()); +- } +- else +- { +- // Configure matrix multiply kernel +- if (!_assembly_path) +- { +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(matrix_a, matrix_b, output); +- _mm_kernel = std::move(k); +- } +- // Configure offset contribution kernel +- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, +- _b_offset == 0 ? 
nullptr : &_vector_sum_row, +- a_to_use->info()->dimension(0), _a_offset, _b_offset); +- } +- } +- +- // Allocate tensors +- if (!_assembly_path && !_run_vector_matrix_multiplication) +- { +- _tmp_a.allocator()->allocate(); +- if (!_reshape_b_only_on_first_run) +- { +- _tmp_b.allocator()->allocate(); +- } +- } +- +- if (!_fused_assembly_path) +- { +- if (_a_offset != 0 && !_reshape_b_only_on_first_run) +- { +- _vector_sum_col.allocator()->allocate(); +- } +- +- if (_b_offset != 0) +- { +- _vector_sum_row.allocator()->allocate(); +- } +- } +- +- if (_fuse_output_stage) +- { +- _mm_result_s32.allocator()->allocate(); +- } +-} +- +-Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b, +- const ITensorInfo *c, const ITensorInfo *output, +- const GEMMInfo &gemm_info) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8); +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG( +- c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, +- "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), +- "The product AB is defined only if the number of columns in A is " +- "equal to the number of rows in B"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), +- "Matrix A already reshaped is not supported"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), +- "Matrix B already reshaped is not supported"); +- +- GEMMInfo info = gemm_info; +- const ITensorInfo *matrix_a_info = a; +- const ITensorInfo *matrix_b_info = b; +- +- const ITensorInfo *a_to_use = a; +- +- TensorInfo tmp_a_info{}; +- TensorInfo tmp_b_info{}; +- TensorInfo mm_result_s32_info{}; +- +- int32_t a_offset = a->quantization_info().uniform().offset; +- int32_t b_offset = b->quantization_info().uniform().offset; +- +- bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; +- if (fuse_output_stage) +- { +- auto_init_if_empty( +- mm_result_s32_info, +- a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); +- } +- +- // Check if we need to run the optimized assembly kernel +- bool run_optimised = false; +- bool run_optimised_requantized = false; +- if (a_to_use->data_type() == DataType::QASYMM8 && +- info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) +- { +- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info)); +- run_optimised_requantized = run_optimised; +- } +- else +- { +- run_optimised = bool(NEGEMMAssemblyDispatch::validate( +- a_to_use, b, c, fuse_output_stage ? 
&mm_result_s32_info : output, gemm_info)); +- } +- +- if (run_optimised) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); +- if (info.depth_output_gemm3d() != 0) +- { +- if (info.reinterpret_input_as_3d()) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); +- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); +- } +- else +- { +- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); +- } +- } +- else +- { +- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); +- } +- } +- else +- { +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), +- "NEGEMM cannot reinterpret the input tensor as 3D"); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, +- "NEGEMM cannot reinterpret the output tensor as 3D"); +- +- const bool run_vector_matrix_multiplication = a->dimension(1) < 2; +- if (!run_vector_matrix_multiplication) +- { +- matrix_a_info = &tmp_a_info; +- matrix_b_info = &tmp_b_info; +- +- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / +- // 4.0f) ] +- TensorShape shape_tmp_a = a->tensor_shape(); +- shape_tmp_a.set(0, a->dimension(0) * 4); +- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); +- +- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width +- // / 16.0f) ] +- TensorShape shape_tmp_b = b->tensor_shape(); +- shape_tmp_b.set(0, b->dimension(1) * 16); +- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); +- +- // Validate interleave kernel +- auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); +- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); +- +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); +- } +- } +- +- if (!run_optimised_requantized) +- { +- TensorInfo info_vector_sum_col{}; +- TensorInfo info_vector_sum_row{}; +- +- // Validate matrix B reduction kernel only if _a_offset is not equal to 0 +- if (a_offset != 0) +- { +- info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); +- +- // Configure Matrix B reduction kernel +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate( +- b, &info_vector_sum_col, a->dimension(0), false)); +- } +- +- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 +- if (b_offset != 0) +- { +- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); +- +- // Configure matrix A reduction kernel +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate( +- a_to_use, &info_vector_sum_row, a->dimension(0), false)); +- } +- +- if (fuse_output_stage) +- { +- if (!run_optimised) +- { +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate( +- matrix_a_info, matrix_b_info, &mm_result_s32_info)); +- } +- +- // Validate offset contribution kernel +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate( +- &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, +- b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, +- info.gemmlowp_output_stage())); +- } +- else +- { +- if (!run_optimised) +- { +- ARM_COMPUTE_RETURN_ON_ERROR( +- NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); +- } +- // Validate offset contribution kernel +- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate( +- output, a_offset == 0 ? nullptr : &info_vector_sum_col, +- b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset)); +- } +- } +- return Status{}; +-} +- +-void NEGEMMLowpMatrixMultiplyCoreEx::run() +-{ +- prepare(); +- +- MemoryGroupResourceScope scope_mg(_memory_group); +- +- // Reshape inputs +- if (_mtx_a_reshape_kernel) +- { +- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); +- } +- if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run) +- { +- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); +- } +- +- // Run GEMM +- if (_asm_glue.is_configured()) +- { +- _asm_glue.run(); +- } +- else +- { +- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); +- } +- +- if (!_fused_assembly_path) +- { +- // Run matrix A reduction kernel only if _b_offset is not equal to 0 +- if (_b_offset != 0) +- { +- NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX); +- } +- +- // Run matrix B reduction kernel only if _a_offset is not equal to 0 +- if (_a_offset != 0 && !_reshape_b_only_on_first_run) +- { +- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); +- } +- +- if (_fuse_output_stage) +- { +- // Run offset contribution kernel +- NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY); +- } +- else +- { +- // Run offset contribution kernel +- NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); +- } +- } +-} +- +-void NEGEMMLowpMatrixMultiplyCoreEx::prepare() +-{ +- if (!_is_prepared) +- { +- // Run assembly reshape +- if (_asm_glue.is_configured() && _reshape_b_only_on_first_run) +- { +- ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); +- +- _asm_glue.prepare(); +- _original_b->mark_as_unused(); +- } +- // Run non-assembly reshape +- else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run) +- { +- ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); +- +- // Run reshape kernel and mark original weights tensor as unused +- _tmp_b.allocator()->allocate(); +- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); +- _original_b->mark_as_unused(); +- } +- +- // Run matrix B reduction kernel only if _a_offset is not equal to 0 +- if (_a_offset != 0 && _reshape_b_only_on_first_run) +- { +- _vector_sum_col.allocator()->allocate(); +- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); +- } +- +- _is_prepared = true; +- } +-} +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp +index c8bb88a..433c35d 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp +@@ -41,7 +41,7 @@ + #include "arm_compute/runtime/NEON/functions/NEGatherEx.h" + + #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + #include + +@@ -49,7 +49,7 @@ namespace arm_compute + { + void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) + { +- auto k = 
arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp +index 078019f..52d58ac 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp +@@ -41,14 +41,14 @@ + #include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h" + + #include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" +-#include "support/ToolchainSupport.h" ++#include "support/MemorySupport.h" + + using namespace arm_compute; + + void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, + ITensor *output, ITensor *hits) + { +- auto k = arm_compute::support::cpp14::make_unique(); ++ auto k = support::cpp14::make_unique(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); + } +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp +deleted file mode 100644 +index dac3b84..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp ++++ /dev/null +@@ -1,55 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/runtime/NEON/functions/NEPReLU.h" +- +-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" +-#include "support/ToolchainSupport.h" +- +-#include +- +-using namespace arm_compute; +- +-void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, alpha, output); +- _kernel = std::move(k); +-} +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp +deleted file mode 100644 +index 0e9a5e9..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp ++++ /dev/null +@@ -1,161 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" +-#include "arm_compute/runtime/NEON/NEScheduler.h" +- +-namespace arm_compute +-{ +-NERNNLayerEx::NERNNLayerEx(std::shared_ptr memory_manager) +- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), +- _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), +- _gemm_output(), _add_output(), _is_prepared(false) +-{ +-} +- +-Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, +- const ITensorInfo *recurrent_weights, const ITensorInfo *bias, +- const ITensorInfo *hidden_state, const ITensorInfo *output, +- const ActivationLayerInfo &info) +-{ +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, +- output); +- +- const int idx_width = 0; +- const int idx_height = 1; +- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); +- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != +- recurrent_weights->dimension(idx_width)); +- ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != +- recurrent_weights->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); +- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), +- hidden_state->tensor_shape()); +- +- auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape( +- recurrent_weights, hidden_state->dimension(idx_height)), +- 1, input->data_type()); +- +- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); +- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate( +- &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); +- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info)); +- +- return Status{}; +-} +- +-void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights, +- const ITensor *recurrent_weights, const ITensor *bias, +- ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); +- ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(), +- recurrent_weights->info(), bias->info(), +- hidden_state->info(), output->info(), info)); +- +- const int idx_height = 1; +- TensorShape shape = misc::shape_calculator::compute_rnn_shape( +- recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); +- +- _is_prepared = false; +- +- // Manage intermediate buffers and configure +- _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- +- // Manage intermediate buffers and configure +- _memory_group.manage(&_fully_connected_out); +- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); +- +- 
_memory_group.manage(&_gemm_output); +- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f); +- +- _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); +- _memory_group.manage(&_add_output); +- +- _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, +- ConvertPolicy::SATURATE); +- +- _fully_connected_out.allocator()->allocate(); +- _gemm_output.allocator()->allocate(); +- +- _activation_kernel.configure(&_add_output, hidden_state, info); +- _add_output.allocator()->allocate(); +- +- _copy_kernel.configure(hidden_state, output); +-} +- +-void NERNNLayerEx::run() +-{ +- prepare(); +- +- MemoryGroupResourceScope scope_mg(_memory_group); +- +- _fully_connected_kernel.run(); +- +- _gemm_state_f.run(); +- +- NEScheduler::get().schedule(&_add_kernel, Window::DimY); +- NEScheduler::get().schedule(&_activation_kernel, Window::DimY); +- +- // copy hidden out to output +- NEScheduler::get().schedule(&_copy_kernel, Window::DimY); +-} +- +-void NERNNLayerEx::prepare() +-{ +- if (!_is_prepared) +- { +- _fully_connected_kernel.prepare(); +- _gemm_state_f.prepare(); +- +- _is_prepared = true; +- } +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp +deleted file mode 100644 +index 116bba3..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp ++++ /dev/null +@@ -1,180 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2018-2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. 
+- */ +- +-#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h" +- +-#include "arm_compute/core/Helpers.h" +-#include "arm_compute/runtime/NEON/NEScheduler.h" +- +-using namespace arm_compute; +- +-NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr memory_manager) +- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), +- _reduction_ops(), _keep_dims() +-{ +-} +- +-Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis, +- bool keep_dims, const ITensorInfo *output) +-{ +- ARM_COMPUTE_UNUSED(keep_dims); +- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); +- ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); +- +- TensorShape out_shape = input->tensor_shape(); +- const unsigned int reduction_ops = reduction_axis.num_dimensions(); +- const int input_dims = input->num_dimensions(); +- Coordinates axis_local = reduction_axis; +- +- // Convert negative axis +- for (unsigned int i = 0; i < reduction_ops; ++i) +- { +- axis_local[i] = wrap_around(axis_local[i], input_dims); +- } +- +- std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); +- for (unsigned int i = 0; i < reduction_ops; ++i) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); +- ARM_COMPUTE_RETURN_ERROR_ON(static_cast(axis_local[i]) > +- input->num_dimensions() - 1); +- if (output->total_size() > 0 && keep_dims) +- { +- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); +- } +- if (keep_dims) +- { +- out_shape.set(axis_local[i], 1); +- } +- else +- { +- out_shape.remove_dimension(axis_local[i] - i); +- } +- } +- const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); +- +- return Status{}; +-} +- +-void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, +- ITensor *output) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input); +- +- _reduction_ops = reduction_axis.num_dimensions(); +- _reduction_kernels = +- arm_compute::support::cpp14::make_unique(_reduction_ops); +- _reduced_outs = +- arm_compute::support::cpp14::make_unique(_reduction_ops - (keep_dims ? 1 : 0)); +- _keep_dims = keep_dims; +- +- Coordinates axis_local = reduction_axis; +- const int input_dims = input->info()->num_dimensions(); +- const unsigned int reduction_ops = reduction_axis.num_dimensions(); +- +- // Convert negative axis +- for (unsigned int i = 0; i < reduction_ops; ++i) +- { +- axis_local[i] = wrap_around(axis_local[i], input_dims); +- } +- +- // Perform reduction for every axis +- for (unsigned int i = 0; i < _reduction_ops; ++i) +- { +- TensorShape out_shape = i == 0 ? input->info()->tensor_shape() +- : (_reduced_outs.get() + i - 1)->info()->tensor_shape(); +- out_shape.set(axis_local[i], 1); +- auto in = (i == 0) ? 
input : (_reduced_outs.get() + i - 1); +- +- if (i == _reduction_ops - 1 && keep_dims) +- { +- _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); +- } +- else +- { +- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), +- input->info()->data_type(), +- input->info()->quantization_info()) +- .set_data_layout(output->info()->data_layout())); +- _memory_group.manage(_reduced_outs.get() + i); +- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], +- ReductionOperation::MEAN_SUM); +- } +- } +- +- // Allocate intermediate tensors +- for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) +- { +- _reduced_outs[i].allocator()->allocate(); +- } +- +- // Configure reshape layer if we want to drop the dimensions +- if (!keep_dims) +- { +- TensorShape out_shape = input->info()->tensor_shape(); +- +- // We have to sort the reduction axis vectors in order for remove_dimension +- // to work properly +- std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); +- for (unsigned int i = 0; i < _reduction_ops; ++i) +- { +- out_shape.remove_dimension(axis_local[i] - i); +- } +- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); +- _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output); +- } +-} +- +-void NEReduceMeanEx::run() +-{ +- _memory_group.acquire(); +- +- for (unsigned int i = 0; i < _reduction_ops; ++i) +- { +- _reduction_kernels[i].run(); +- } +- +- if (!_keep_dims) +- { +- _reshape.run(); +- } +- _memory_group.release(); +-} +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp +deleted file mode 100644 +index 198bb76..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp ++++ /dev/null +@@ -1,114 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +-#include "arm_compute/runtime/NEON/NEScheduler.h" +- +-namespace arm_compute +-{ +-NESpaceToBatchLayerEx::NESpaceToBatchLayerEx() +- : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) +-{ +-} +- +-void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape, +- const ITensor *paddings, ITensor *output) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); +- +- if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) +- { +- _has_padding = true; +- _memset_kernel.configure( +- output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); +- } +- _space_to_batch_kernel.configure(input, block_shape, paddings, output); +-} +- +-void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x, +- const int block_shape_y, const Size2D &padding_left, +- const Size2D &padding_right, ITensor *output) +-{ +- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); +- +- if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) +- { +- _has_padding = true; +- _memset_kernel.configure( +- output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); +- } +- _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, +- output); +-} +- +-Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape, +- const ITensorInfo *paddings, const ITensorInfo *output) +-{ +- ARM_COMPUTE_RETURN_ON_ERROR( +- NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); +- +- return Status{}; +-} +- +-Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x, +- const int block_shape_y, const Size2D &padding_left, +- const Size2D &padding_right, const ITensorInfo *output) +-{ +- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate( +- input, block_shape_x, block_shape_y, padding_left, padding_right, output)); +- +- return Status{}; +-} +- +-void NESpaceToBatchLayerEx::run() +-{ +- // Zero out output only if we have paddings +- if (_has_padding) +- { +- NEScheduler::get().schedule(&_memset_kernel, Window::DimY); +- } +- NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY); +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp +deleted file mode 100644 +index 97697e3..0000000 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp ++++ /dev/null +@@ -1,64 +0,0 @@ +-/* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. 
+- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * Copyright (c) 2019 ARM Limited. +- * +- * SPDX-License-Identifier: MIT +- * +- * Permission is hereby granted, free of charge, to any person obtaining a copy +- * of this software and associated documentation files (the "Software"), to +- * deal in the Software without restriction, including without limitation the +- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +- * sell copies of the Software, and to permit persons to whom the Software is +- * furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in all +- * copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +- * SOFTWARE. +- */ +- +-#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h" +- +-#include "arm_compute/core/Error.h" +-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" +-#include "arm_compute/core/TensorInfo.h" +-#include "arm_compute/core/Types.h" +-#include "arm_compute/core/Validate.h" +- +-namespace arm_compute +-{ +-void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +-{ +- auto k = arm_compute::support::cpp14::make_unique(); +- k->configure(input, output, block_shape); +- _kernel = std::move(k); +-} +- +-Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, +- int32_t block_shape) +-{ +- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape)); +- return Status{}; +-} +-} // namespace arm_compute +diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +index df06892..09f1780 100644 +--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp ++++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +@@ -1,21 +1,5 @@ + /* +- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. 
+- */ +- +-/* +- * Copyright (c) 2017-2019 ARM Limited. ++ * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * +@@ -37,14 +21,11 @@ + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +- + #include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h" + + #include "arm_compute/core/Helpers.h" +-#include "arm_compute/core/Utils.h" + #include "arm_compute/core/UtilsEx.h" + #include "arm_compute/core/Validate.h" +-#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + #include "arm_compute/runtime/NEON/NEScheduler.h" + +@@ -52,20 +33,15 @@ using namespace arm_compute::misc::shape_calculator; + + namespace arm_compute + { ++ + NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _conv_f(), + _upsample_f(), + _flip_weights(), +- _permute_input(), +- _permute_weights(), +- _permute_output(), + _scaled_output(), + _weights_flipped(), +- _permuted_input(), +- _permuted_weights(), +- _permuted_output(), +- _is_nchw(false), ++ _flip_axis(), + _original_weights(nullptr), + _input(nullptr), + _info(), +@@ -80,7 +56,7 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, +- DataType::QASYMM8); ++ DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); + const unsigned int width_idx = +@@ -95,13 +71,16 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf + weights->dimension(height_idx), info, invalid_right, invalid_bottom); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); +- if (is_data_type_quantized_asymmetric(input->data_type()) && bias) ++ if (bias != nullptr) + { +- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); +- } +- else if (bias) +- { +- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ++ if (is_data_type_quantized_asymmetric(input->data_type())) ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ++ } ++ else ++ { ++ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ++ } + } + + if (output->tensor_shape().total_size() > 0) +@@ -110,12 +89,12 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(), +- "Output's dim 0 is invalid."); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(), +- "Output's dim 1 is invalid."); +- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(), +- "Output's dim 2 is invalid."); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), ++ "Output's width is invalid."); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), ++ "Output's height is invalid."); ++ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), ++ "Output's depth is invalid."); + } + + unsigned int pad_left = 0; +@@ -127,7 +106,6 @@ Status 
NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf + pad_bottom); + TensorInfo scale_out_info( + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); +- scale_out_info.set_data_layout(input->data_layout()); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const unsigned int batches_idx = +@@ -149,19 +127,13 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con + ITensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom) + { ++ // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ++ ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( ++ input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), ++ info, invalid_right, invalid_bottom)); + + const DataLayout data_layout = input->info()->data_layout(); +- +- _input = input; +- _original_weights = weights; +- _info = info; +- _is_prepared = false; +- _is_nchw = data_layout == DataLayout::NCHW; +- +- const unsigned int stride_x = info.stride().first; +- const unsigned int stride_y = info.stride().second; +- + const unsigned int width_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = +@@ -173,101 +145,54 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); ++ ++ _input = input; ++ _original_weights = weights; ++ _info = info; ++ _is_prepared = false; ++ ++ unsigned int pad_left = 0; ++ unsigned int pad_right = 0; ++ unsigned int pad_top = 0; ++ unsigned int pad_bottom = 0; ++ const unsigned int stride_x = info.stride().first; ++ const unsigned int stride_y = info.stride().second; ++ + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + +- // Perform validation step +- ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( +- input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), +- info, invalid_right, invalid_bottom)); +- ++ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); + _memory_group.manage(&_scaled_output); + +- if (!_is_nchw) +- { +- _memory_group.manage(&_permuted_input); +- _memory_group.manage(&_permuted_weights); +- _memory_group.manage(&_permuted_output); +- +- // Configure the function to transform the input tensor from NHWC -> NCHW +- _permuted_input.info()->set_quantization_info(input->info()->quantization_info()); +- _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); +- _permuted_input.info()->set_data_layout(DataLayout::NCHW); +- +- // Configure the function to transform the weights tensor from NHWC -> NCHW +- _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info()); +- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); +- _permuted_weights.info()->set_data_layout(DataLayout::NCHW); +- +- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in +- // order to match output shape +- +- unsigned int pad_left = 0; +- unsigned int pad_right = 0; +- unsigned int pad_top = 0; +- unsigned int pad_bottom = 0; +- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( +- *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right, +- invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); +- +- TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(), +- _permuted_input.info()->quantization_info()); +- scale_out_info.set_data_layout(DataLayout::NCHW); +- _scaled_output.allocator()->init(scale_out_info); +- +- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, +- DimensionRoundingType::CEIL); +- _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info); +- +- _weights_flipped.allocator()->init(*_permuted_weights.info()->clone()); +- _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info()); +- _flip_weights.configure(&_permuted_weights, &_weights_flipped); +- +- // setup the function to convolve the upscaled output +- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); +- +- const auto out_shape = output->info()->tensor_shape(); +- TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]}; +- TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(), +- output->info()->quantization_info()); +- _permuted_output.allocator()->init(permuted_out_info); +- _permuted_output.info()->set_data_layout(DataLayout::NCHW); +- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info); +- +- // Configure the function to transform the convoluted output to NHWC +- _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); +- +- _permuted_input.allocator()->allocate(); +- _permuted_weights.allocator()->allocate(); +- _permuted_output.allocator()->allocate(); +- } +- else +- { +- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in +- // order to match output shape +- unsigned int pad_left = 0; +- unsigned int pad_right = 0; +- unsigned int pad_top = 0; +- unsigned int pad_bottom = 0; +- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( +- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, +- 
pad_right, pad_top, pad_bottom); +- +- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), +- input->info()->quantization_info()); +- _scaled_output.allocator()->init(scale_out_info); +- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, +- DimensionRoundingType::FLOOR); +- _upsample_f.configure(input, &_scaled_output, upsample_info); +- +- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); +- _flip_weights.configure(weights, &_weights_flipped); +- +- // setup the function to convolve the upscaled output +- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); +- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); +- } ++ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); ++ _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); ++ ++ // setup the function to convolve the upscaled output ++ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ++ ++ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( ++ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, ++ pad_right, pad_top, pad_bottom); ++ ++ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, ++ DimensionRoundingType::FLOOR); ++ ++ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), ++ input->info()->quantization_info()); ++ scale_out_info.set_data_layout(data_layout); ++ _scaled_output.allocator()->init(scale_out_info); ++ ++ _upsample_f.configure(input, &_scaled_output, upsample_info); ++ ++ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); ++ ++ // Setup flip axis data ++ _flip_axis.allocator()->allocate(); ++ auto axis_data = reinterpret_cast(_flip_axis.buffer()); ++ axis_data[0] = static_cast(width_idx); ++ axis_data[1] = static_cast(height_idx); ++ + _scaled_output.allocator()->allocate(); + } + +@@ -275,22 +200,10 @@ void NETransposeConvLayer::run() + { + prepare(); + +- // MemoryGroupResourceScope scope_mg(_memory_group); +- +- // Permute input +- if (!_is_nchw) +- { +- _permute_input.run(); +- } ++ MemoryGroupResourceScope scope_mg(_memory_group); + + _upsample_f.run(); + _conv_f.run(); +- +- // Permute output +- if (!_is_nchw) +- { +- _permute_output.run(); +- } + } + + void NETransposeConvLayer::prepare() +@@ -301,22 +214,12 @@ void NETransposeConvLayer::prepare() + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); +- // Permute weights +- if (!_is_nchw) +- { +- _permute_weights.run(); +- } +- NEScheduler::get().schedule(&_flip_weights, Window::DimZ); ++ _flip_weights.run(); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + +- if (!_weights_flipped.is_used()) +- { +- _weights_flipped.allocator()->free(); +- } +- + _is_prepared = true; + } + } +diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt +index 09f6725..609dd45 100644 +--- a/compute/cker/CMakeLists.txt ++++ b/compute/cker/CMakeLists.txt +@@ -8,6 +8,9 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp) + target_link_libraries(nnfw_lib_cker INTERFACE ruy) + target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation) + target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV) ++if(EXPERIMENTAL_RUY_FEATURE) ++ 
target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE) ++endif(EXPERIMENTAL_RUY_FEATURE) + if(PROFILE_RUY) + target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler) + endif(PROFILE_RUY) +diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h +index 41b1916..1bde640 100644 +--- a/compute/cker/include/cker/Types.h ++++ b/compute/cker/include/cker/Types.h +@@ -259,6 +259,12 @@ struct FullyConnectedParams + // FullyConnectedWeightsFormat weights_format; + }; + ++struct L2NormParams ++{ ++ // uint8 inference params. ++ int32_t input_zero_point; ++}; ++ + struct GatherParams + { + int32_t axis; +@@ -338,6 +344,11 @@ struct SpaceToBatchParams + int32_t output_offset; + }; + ++struct SpaceToDepthParams ++{ ++ int32_t block_size; ++}; ++ + enum class Order + { + kColMajor, +diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h +index b69d55c..2abb998 100644 +--- a/compute/cker/include/cker/Utils.h ++++ b/compute/cker/include/cker/Utils.h +@@ -123,6 +123,68 @@ inline int CountLeadingZeros(uint32_t integer_input) + return leading_zeros; + } + ++inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, ++ int32_t *output_inv_sqrt, int *output_shift) ++{ ++ assert(input >= 0); ++ if (input <= 1) ++ { ++ // Handle the input value 1 separately to avoid overflow in that case ++ // in the general computation below (b/143972021). Also handle 0 as if it ++ // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid ++ // but rare/unrealistic input value. We can expect both to occur in some ++ // incompletely trained models, but probably not in fully trained models. ++ *output_inv_sqrt = std::numeric_limits<int32_t>::max(); ++ *output_shift = 0; ++ return; ++ } ++ assert(input > 1); ++ *output_shift = 11; ++ while (input >= (1 << 29)) ++ { ++ input /= 4; ++ ++*output_shift; ++ } ++ const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1; ++ const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; ++ const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; ++ *output_shift -= left_shift_bit_pairs; ++ input <<= 2 * left_shift_bit_pairs; ++ assert(input >= (1 << 27)); ++ assert(input < (1 << 29)); ++ using gemmlowp::FixedPoint; ++ using gemmlowp::Rescale; ++ using gemmlowp::SaturatingRoundingMultiplyByPOT; ++ // Using 3 integer bits gives us enough room for the internal arithmetic in ++ // this Newton-Raphson iteration. ++ using F3 = FixedPoint<int32_t, 3>; ++ using F0 = FixedPoint<int32_t, 0>; ++ const F3 fixedpoint_input = F3::FromRaw(input >> 1); ++ const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); ++ const F3 fixedpoint_half_three = ++ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); ++ // Newton-Raphson iteration ++ // Naive unoptimized starting guess: x = 1 ++ F3 x = F3::One(); ++ // Naive unoptimized number of iterations: 5 ++ for (int i = 0; i < 5; i++) ++ { ++ const F3 x3 = Rescale<3>(x * x * x); ++ x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); ++ } ++ const F0 fixedpoint_half_sqrt_2 = ++ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); ++ x = x * fixedpoint_half_sqrt_2; ++ *output_inv_sqrt = x.raw(); ++ if (*output_shift < 0) ++ { ++ *output_inv_sqrt <<= -*output_shift; ++ *output_shift = 0; ++ } ++ // Convert right shift (right is positive) to left shift.
++ *output_shift *= reverse_shift; ++} ++ + // Comment from tensorflow lite: + // + // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h +index 9bcf3fd..9b72811 100644 +--- a/compute/cker/include/cker/operation/FullyConnected.h ++++ b/compute/cker/include/cker/operation/FullyConnected.h +@@ -78,8 +78,11 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu + MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size, + output_data, /*result_stride=*/1); + +- // Apply activation function +- ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); ++ if (params.activation != FusedActivationFunctionType::kNone) ++ { ++ // Apply activation function ++ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); ++ } + } + + inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, +@@ -195,7 +198,11 @@ inline void FullyConnectedHybrid(const FullyConnectedParams ¶ms, const Shape + #endif + + // Apply activation function to floats. +- ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); ++ if (params.activation != FusedActivationFunctionType::kNone) ++ { ++ // Apply activation function ++ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); ++ } + return; + } + +diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h +new file mode 100644 +index 0000000..a0075c3 +--- /dev/null ++++ b/compute/cker/include/cker/operation/L2Normalize.h +@@ -0,0 +1,94 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright 2017 The TensorFlow Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#ifndef __NNFW_CKER_L2NORMALIZE_H__ ++#define __NNFW_CKER_L2NORMALIZE_H__ ++ ++#include "cker/Shape.h" ++#include "cker/Utils.h" ++#include "cker/Types.h" ++ ++namespace nnfw ++{ ++namespace cker ++{ ++ ++void L2NormalizeFloat32(const Shape &input_shape, const float *input_data, ++ const Shape &output_shape, float *output_data) ++{ ++ float epsilon = 1e-6; ++ const int trailing_dim = input_shape.DimensionsCount() - 1; ++ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); ++ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); ++ for (int i = 0; i < outer_size; ++i) ++ { ++ float squared_l2_norm = 0; ++ for (int c = 0; c < depth; ++c) ++ { ++ const float val = input_data[c]; ++ squared_l2_norm += val * val; ++ } ++ float l2_norm = std::sqrt(squared_l2_norm); ++ l2_norm = std::max(l2_norm, epsilon); ++ for (int c = 0; c < depth; ++c) ++ { ++ *output_data = *input_data / l2_norm; ++ ++output_data; ++ ++input_data; ++ } ++ } ++} ++ ++void L2NormalizeQuant8(L2NormParams ¶ms, const Shape &input_shape, const uint8_t *input_data, ++ const Shape &output_shape, uint8_t *output_data) ++{ ++ const int trailing_dim = input_shape.DimensionsCount() - 1; ++ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); ++ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); ++ const int32_t input_zero_point = params.input_zero_point; ++ ++ for (int i = 0; i < outer_size; ++i) ++ { ++ int32_t square_l2_norm = 0; ++ for (int c = 0; c < depth; c++) ++ { ++ // Note that input_data advances by depth in the second pass below. ++ int32_t diff = input_data[c] - input_zero_point; ++ square_l2_norm += diff * diff; ++ } ++ int32_t inv_l2norm_multiplier; ++ int inv_l2norm_shift; ++ GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift); ++ for (int c = 0; c < depth; c++) ++ { ++ int32_t diff = *input_data - input_zero_point; ++ int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( ++ 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); ++ int32_t unclamped_output_val = 128 + rescaled_diff; ++ int32_t output_val = std::min(static_cast(255), ++ std::max(static_cast(0), unclamped_output_val)); ++ *output_data = static_cast(output_val); ++ ++input_data; ++ ++output_data; ++ } ++ } ++} ++ ++} // namespace cker ++} // namespace nnfw ++ ++#endif // __NNFW_CKER_L2NORMALIZE_H__ +diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h +index 7477858..3d3e59e 100644 +--- a/compute/cker/include/cker/operation/Logistic.h ++++ b/compute/cker/include/cker/operation/Logistic.h +@@ -32,18 +32,9 @@ namespace cker + inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) + { +-#ifdef __aarch64__ + auto input_map = MapAsVector(input_data, input_shape); + auto output_map = MapAsVector(output_data, output_shape); + output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op()); +-#else +- // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2) +- const int size = MatchingFlatSize(input_shape, output_shape); +- for (int i = 0; i < size; i++) +- { +- output_data[i] = 1.f / (1.f + std::exp(-input_data[i])); +- } +-#endif + } + + } // namespace cker +diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h +index af432f3..4a2732d 100644 +--- 
a/compute/cker/include/cker/operation/Pad.h ++++ b/compute/cker/include/cker/operation/Pad.h +@@ -26,9 +26,10 @@ namespace nnfw + { + namespace cker + { ++template + inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape, +- const float *input_data, const Shape &output_shape, float *output_data, +- const float *constant_value_data) ++ const T *input_data, const Shape &output_shape, T *output_data, ++ const T *constant_value_data) + { + // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC` + // TODO: come up with more subtle solution that uses subtensors like arm compute +@@ -38,7 +39,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu + /** List of padding information */ + using PaddingList = std::vector; + +- auto constant_value = constant_value_data ? *constant_value_data : 0; ++ const T constant_value = constant_value_data ? *constant_value_data : 0; + assert(output_shape.DimensionsCount() == input_shape.DimensionsCount()); + + PaddingList padding_list(pad_rank); +@@ -64,7 +65,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu + { + const int32_t in_row_len = input_shape.Dims(0); + std::fill_n(output_data, padding_list[0].first, constant_value); +- std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float)); ++ std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T)); + std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second, + constant_value); + break; +@@ -89,7 +90,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu + out_offset += padding_list[1].first; + + // copy a row of input data +- memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); ++ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T)); + + out_offset += in_row_len; + +@@ -132,7 +133,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu + out_offset += padding_list[2].first; + + // copy a row of input data +- memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); ++ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T)); + + out_offset += in_row_len; + +@@ -191,7 +192,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu + out_c_offset += padding_list[3].first; + + // copy a row of input data +- memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float)); ++ memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T)); + + out_c_offset += in_row_len; + +diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h +new file mode 100644 +index 0000000..5c82d11 +--- /dev/null ++++ b/compute/cker/include/cker/operation/Quantize.h +@@ -0,0 +1,47 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __NNFW_CKER_QUANTIZE_H__ ++#define __NNFW_CKER_QUANTIZE_H__ ++ ++#include "cker/Shape.h" ++#include "cker/Types.h" ++#include "cker/Utils.h" ++#include ++#include ++namespace nnfw ++{ ++namespace cker ++{ ++template <typename InputT, typename OutputT> ++inline void Quantize(const Shape &input_shape, const InputT *input_data, const Shape &output_shape, ++ OutputT *output_data, const float output_scale, const int32_t output_offset) ++{ ++ const int flat_size = MatchingFlatSize(input_shape, output_shape); ++ int min_val = std::numeric_limits<OutputT>::min(); ++ int max_val = std::numeric_limits<OutputT>::max(); ++ ++ for (int i = 0; i < flat_size; i++) ++ { ++ int32_t unclamped = static_cast<int32_t>(round(input_data[i] / output_scale)) + output_offset; ++ int32_t clamped = std::min(std::max(unclamped, min_val), max_val); ++ output_data[i] = clamped; ++ } ++} ++} // namespace cker ++} // namespace nnfw ++ ++#endif // __NNFW_CKER_QUANTIZE_H__ +diff --git a/compute/cker/include/cker/operation/SpaceToDepth.h b/compute/cker/include/cker/operation/SpaceToDepth.h +new file mode 100644 +index 0000000..ef67931 +--- /dev/null ++++ b/compute/cker/include/cker/operation/SpaceToDepth.h +@@ -0,0 +1,71 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * Copyright 2017 The TensorFlow Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __NNFW_CKER_SPACE_TO_DEPTH_H__ ++#define __NNFW_CKER_SPACE_TO_DEPTH_H__ ++ ++#include "cker/Shape.h" ++#include "cker/Types.h" ++ ++namespace nnfw ++{ ++namespace cker ++{ ++ ++template <typename T> ++inline void SpaceToDepth(const SpaceToDepthParams &params, const Shape &unextended_input_shape, ++ const T *input_data, const Shape &unextended_output_shape, T *output_data) ++{ ++ assert(unextended_input_shape.DimensionsCount() <= 4); ++ assert(unextended_output_shape.DimensionsCount() <= 4); ++ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); ++ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); ++ ++ const int output_depth = output_shape.Dims(3); ++ const int output_width = output_shape.Dims(2); ++ const int output_height = output_shape.Dims(1); ++ ++ const int input_depth = input_shape.Dims(3); ++ const int batch_size = input_shape.Dims(0); ++ ++ // Number of continuous values that we can copy in one iteration.
++ const int stride = params.block_size * input_depth; ++ ++ for (int batch = 0; batch < batch_size; ++batch) ++ { ++ for (int out_h = 0; out_h < output_height; ++out_h) ++ { ++ T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0); ++ for (int offset_h = 0; offset_h < params.block_size; ++offset_h) ++ { ++ T *dst = output_ptr; ++ for (int out_w = 0; out_w < output_width; ++out_w) ++ { ++ memcpy(dst, input_data, stride * sizeof(T)); ++ input_data += stride; ++ dst += output_depth; ++ } ++ output_ptr += stride; ++ } ++ } ++ } ++} ++ ++} // namespace cker ++} // namespace nnfw ++ ++#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__ +diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h +index 432b181..080f66f 100644 +--- a/compute/cker/include/cker/ruy/RuySupport.h ++++ b/compute/cker/include/cker/ruy/RuySupport.h +@@ -24,7 +24,7 @@ + + namespace + { +-const int kDefaultNumThreadpoolThreads = 4; ++const int kDefaultNumThreadpoolThreads = 1; + } + + namespace nnfw +diff --git a/docs/howto/how-to-build-runtime.md b/docs/howto/how-to-build-runtime.md +index 2bfd14c..657f0f7 100644 +--- a/docs/howto/how-to-build-runtime.md ++++ b/docs/howto/how-to-build-runtime.md +@@ -13,7 +13,7 @@ In the Ubuntu, you can easily install it with the following command. + + ``` + $ sudo apt-get install cmake libboost-all-dev +-``` ++``` + + If your linux system does not have the basic development configuration, you will need to install more packages. A list of all packages needed to configure the development environment can be found in the https://github.com/Samsung/ONE/blob/master/infra/docker/Dockerfile.1804 file. + +@@ -44,7 +44,7 @@ python3-venv \ + scons \ + software-properties-common \ + unzip \ +-wget ++wget + + $ mkdir /tmp/gtest + $ cd /tmp/gtest +@@ -63,7 +63,7 @@ In a typical linux development environment, including Ubuntu, you can build the + ``` + $ git clone https://github.com/Samsung/ONE.git one + $ cd one +-$ cp -n Makefile.template Makefile; make install ++$ make -f Makefile.template install + ``` + + Unfortunately, the debug build on the x86_64 architecture currently has an error. To solve the problem, you must use gcc version 9 or higher. Another workaround is to do a release build rather than a debug build. This is not a suitable method for debugging during development, but it is enough to check the function of the runtime. To release build the runtime, add the environment variable `BUILD_TYPE=release` to the build command as follows. +diff --git a/docs/nnfw/howto/CrossBuildForAndroid.md b/docs/nnfw/howto/CrossBuildForAndroid.md +index d7e48c8..08d5fd6 100644 +--- a/docs/nnfw/howto/CrossBuildForAndroid.md ++++ b/docs/nnfw/howto/CrossBuildForAndroid.md +@@ -44,11 +44,9 @@ Different from cross build for linux, + Here is an example of using Makefile. + + ```bash +-cp -n Makefile.template Makefile +- + TARGET_OS=android \ + CROSS_BUILD=1 \ + NDK_DIR=/path/android-tools/r20/ndk \ + EXT_ACL_FOLDER=/path/arm_compute-v19.11.1-bin-android/lib/android-arm64-v8a-neon-cl \ +-make install ++make -f Makefile.template install + ``` +diff --git a/docs/runtime/core.md b/docs/runtime/core.md +index 42ba75f..64a6c62 100644 +--- a/docs/runtime/core.md ++++ b/docs/runtime/core.md +@@ -68,7 +68,7 @@ Let's say we have some functions written in a certain programming language. Then + + With generated tensors and kernels, the compiler creates executor objects. There are 3 types of executors are supported - Linear, Dataflow, and Parallel. 
Linear executor is the default executor and Dataflow Executor and Parallel Executor are experimental. + +-For more about executors, please refer to [Executors](./executors.md) document. ++For more about executors, please refer to [Executors](executors.md) document. + + ### Module `exec` + +@@ -83,4 +83,4 @@ For more about executors, please refer to [Executors](./executors.md) document. + + Backends are plugins and they are loaded dynamically(via `dlopen`). So this module is a set of interface classes for backend implementation. `compiler` can compile with a variety of backends without knowing specific backend implementation. + +-Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](./backend-api.md) document. ++Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](backend-api.md) document. +diff --git a/docs/runtime/heterogeneous-execution.md b/docs/runtime/heterogeneous-execution.md +index dc39dae..e7a5e27 100644 +--- a/docs/runtime/heterogeneous-execution.md ++++ b/docs/runtime/heterogeneous-execution.md +@@ -12,11 +12,11 @@ Here is another case. Let's say we have a model that is not sequential so there + + ![Add-3Conv model](heterogeneous-execution-add-3-conv-model.png) + +-Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](./executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently. ++Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently. + + ## Graph Transformation + +-Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](./core.md#1-lowering) phase of compilation. ++Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](core.md#1-lowering) phase of compilation. + + Here is an example of that. Let's say we have assigned different backends for Add and Conv2D. So a Permute operation is inserted between them. 
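A rough sketch of the boundary rule described above, using a toy operation list rather than the onert IR: whenever a producer and its consumer were assigned different backends, a Permute node is spliced onto the edge so each backend only touches tensors it manages. The `Op` struct and `InsertPermutes` function are illustrative names, not runtime API:

```cpp
#include <cstddef>
#include <string>
#include <vector>

struct Op
{
  std::string name;
  std::string backend; // e.g. "cpu", "acl_cl", "acl_neon"
};

// Insert a Permute wherever two adjacent operations run on different backends.
std::vector<Op> InsertPermutes(const std::vector<Op> &sequence)
{
  std::vector<Op> lowered;
  for (std::size_t i = 0; i < sequence.size(); ++i)
  {
    if (i > 0 && sequence[i - 1].backend != sequence[i].backend)
      lowered.push_back({"Permute", sequence[i - 1].backend + "->" + sequence[i].backend});
    lowered.push_back(sequence[i]);
  }
  return lowered;
}

int main()
{
  // Add on cpu feeding Conv2D on acl_cl gets exactly one Permute in between.
  auto lowered = InsertPermutes({{"Add", "cpu"}, {"Conv2D", "acl_cl"}});
  return lowered.size() == 3 ? 0 : 1;
}
```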
+ +diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake +index 51a235a..adec1f9 100644 +--- a/infra/cmake/packages/ARMComputeSourceConfig.cmake ++++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake +@@ -8,7 +8,7 @@ function(_ARMComputeSource_import) + nnas_include(OptionTools) + + envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") +- set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v19.11.1.tar.gz) ++ set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v20.05.tar.gz) + ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL}) + + set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE) +diff --git a/infra/cmake/packages/FlatBuffersConfig.cmake b/infra/cmake/packages/FlatBuffersConfig.cmake +index ab0b770..da084e7 100644 +--- a/infra/cmake/packages/FlatBuffersConfig.cmake ++++ b/infra/cmake/packages/FlatBuffersConfig.cmake +@@ -25,7 +25,8 @@ function(_FlatBuffers_build) + BUILD_DIR ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS/build + INSTALL_DIR ${EXT_OVERLAY_DIR} + BUILD_FLAGS ${ADDITIONAL_CXX_FLAGS} +- IDENTIFIER "1.10-fix1" ++ IDENTIFIER "1.10-fix2" ++ EXTRA_OPTS "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF" + PKG_NAME "FLATBUFFERS") + + endfunction(_FlatBuffers_build) +diff --git a/infra/cmake/packages/HDF5Config.cmake b/infra/cmake/packages/HDF5Config.cmake +index e282e0b..19803f1 100644 +--- a/infra/cmake/packages/HDF5Config.cmake ++++ b/infra/cmake/packages/HDF5Config.cmake +@@ -27,6 +27,7 @@ _HDF5_build() + find_path(HDF5_CONFIG_DIR "hdf5-config.cmake" + PATHS ${EXT_OVERLAY_DIR} + PATH_SUFFIXES ++ cmake + share/cmake + share/cmake/hdf5 + cmake/hdf5 +diff --git a/infra/cmake/packages/Pybind11Config.cmake b/infra/cmake/packages/Pybind11Config.cmake +new file mode 100644 +index 0000000..3061779 +--- /dev/null ++++ b/infra/cmake/packages/Pybind11Config.cmake +@@ -0,0 +1,21 @@ ++function(_Pybind11_import) ++ nnas_find_package(Pybind11Source QUIET) ++ ++ if(NOT Pybind11Source_FOUND) ++ set(Pybind11_FOUND FALSE PARENT_SCOPE) ++ return() ++ endif(NOT Pybind11Source_FOUND) ++ ++ nnas_include(ExternalBuildTools) ++ ExternalBuild_CMake(CMAKE_DIR ${Pybind11Source_DIR} ++ BUILD_DIR ${CMAKE_BINARY_DIR}/externals/PYBIND11/build ++ INSTALL_DIR ${EXT_OVERLAY_DIR} ++ IDENTIFIER "2.3.0" ++ PKG_NAME "PYBIND11") ++ ++ find_path(Pybind11_INCLUDE_DIRS NAMES pybind11.h PATHS ${EXT_OVERLAY_DIR} PATH_SUFFIXES include/pybind11) ++ ++ set(Pybind11_FOUND TRUE PARENT_SCOPE) ++endfunction(_Pybind11_import) ++ ++_Pybind11_import() +diff --git a/infra/cmake/packages/Pybind11SourceConfig.cmake b/infra/cmake/packages/Pybind11SourceConfig.cmake +new file mode 100644 +index 0000000..4a9c676 +--- /dev/null ++++ b/infra/cmake/packages/Pybind11SourceConfig.cmake +@@ -0,0 +1,18 @@ ++function(_Pybind11Source_import) ++ if(NOT DOWNLOAD_PYBIND11) ++ set(Pybind11Source_FOUND FALSE PARENT_SCOPE) ++ return() ++ endif(NOT DOWNLOAD_PYBIND11) ++ ++ nnas_include(ExternalSourceTools) ++ nnas_include(OptionTools) ++ ++ envoption(PYBIND11_URL https://github.com/pybind/pybind11/archive/v2.3.0.tar.gz) ++ ++ ExternalSource_Download(PYBIND11 ${PYBIND11_URL}) ++ ++ set(Pybind11Source_DIR ${PYBIND11_SOURCE_DIR} PARENT_SCOPE) ++ set(Pybind11Source_FOUND TRUE PARENT_SCOPE) ++endfunction(_Pybind11Source_import) ++ ++_Pybind11Source_import() +diff --git a/infra/docker/Dockerfile b/infra/docker/Dockerfile +index e675b53..052cc4f 100644 +--- a/infra/docker/Dockerfile ++++ b/infra/docker/Dockerfile +@@ -1,8 +1,6 @@ + FROM 
ubuntu:16.04 + + ARG UBUNTU_MIRROR +-ENV http_proxy $http_proxy +-ENV https_proxy $https_proxy + + RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi + RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi +@@ -22,6 +20,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler + + # Additonal tools + RUN apt-get update && apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint ++RUN pip3 install --upgrade pip + RUN pip3 install yapf==0.22.0 numpy + + # Install google test (source) +diff --git a/infra/docker/Dockerfile.1804 b/infra/docker/Dockerfile.1804 +index fc6fc9a..cc31bba 100644 +--- a/infra/docker/Dockerfile.1804 ++++ b/infra/docker/Dockerfile.1804 +@@ -1,12 +1,6 @@ + FROM ubuntu:18.04 + + ARG UBUNTU_MIRROR +-ENV http_proxy $http_proxy +-ENV https_proxy $https_proxy +- +-RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi +-RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi +-RUN if [ -n "$UBUNTU_MIRROR" ] ; then sed "s/archive.ubuntu.com/${UBUNTU_MIRROR}/g" -i /etc/apt/sources.list ; fi + + # Install 'add-apt-repository' + RUN apt-get update && apt-get -qqy install software-properties-common +@@ -22,6 +16,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler + + # Additonal tools + RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint ++RUN pip3 install --upgrade pip + RUN pip3 install yapf==0.22.0 numpy + + # Install google test (source) +diff --git a/infra/nncc/CMakeLists.txt b/infra/nncc/CMakeLists.txt +index 3ac6680..0be6885 100644 +--- a/infra/nncc/CMakeLists.txt ++++ b/infra/nncc/CMakeLists.txt +@@ -98,6 +98,7 @@ option(DOWNLOAD_CAFFE "Download Caffe source" ON) + option(DOWNLOAD_PYTORCH "Download Pytorch source" ON) + option(DOWNLOAD_ONNX "Download ONNX source" ON) + option(DOWNLOAD_ABSEIL "Download Abseil-cpp source" ON) ++option(DOWNLOAD_PYBIND11 "Download Pybind11 source" ON) + + option(DOWNLOAD_GTEST "Download Google Test source" ON) + option(BUILD_GTEST "Build Google Test from the downloaded source" ON) +diff --git a/infra/nncc/command/utcount b/infra/nncc/command/utcount +index d4610e3..d06c5c9 100644 +--- a/infra/nncc/command/utcount ++++ b/infra/nncc/command/utcount +@@ -13,7 +13,7 @@ BUILD_ITEMS="angkor cwrap pepper-str pepper-strcast pp stdex \ + oops pepper-assert \ + hermes hermes-std \ + loco locop locomotiv logo-core logo \ +-foder souschef arser \ ++foder souschef arser vconone \ + safemain mio-circle mio-tflite \ + tflite2circle \ + luci \ +diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt +index 8e7f78e..2442a2d 100644 +--- a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt ++++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt +@@ -100,7 +100,7 @@ target_include_directories(tensorflow-lite-2.2.0 SYSTEM PUBLIC ${TFLITE_INCLUDES + target_compile_definitions(tensorflow-lite-2.2.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV") + set_property(TARGET tensorflow-lite-2.2.0 PROPERTY POSITION_INDEPENDENT_CODE ON) + target_link_libraries(tensorflow-lite-2.2.0 eigen 
${LIB_PTHREAD} dl) +-if(${BUILD_WITH_NNAPI}) ++if(NOT ANDROID AND ${BUILD_WITH_NNAPI}) + target_link_libraries(tensorflow-lite-2.2.0 rt) + endif() + +diff --git a/infra/nnfw/config/gbs.conf b/infra/nnfw/config/gbs.conf +index 515cada..bad9eb2 100644 +--- a/infra/nnfw/config/gbs.conf ++++ b/infra/nnfw/config/gbs.conf +@@ -5,7 +5,7 @@ profile = profile.tizen + [profile.tizen] + user=obs_viewer + obs = obs.tizen +-repos = repo.tizen_base,repo.tizen_mobile ++repos = repo.tizen_one,repo.tizen_base,repo.tizen_mobile + buildroot = /home/GBS-ROOT/ + + [obs.tizen] +@@ -15,6 +15,8 @@ url = http://api.tizen.org + url = http://download.tizen.org/snapshots/tizen/unified/latest/repos/standard/packages/ + + [repo.tizen_base] +-url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/ ++url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/ + ++[repo.tizen_one] ++url = http://nnfw.mooo.com/archive/tizen/ + +diff --git a/infra/packaging/preset/20200630 b/infra/packaging/preset/20200630 +index e159935..c3ca4b6 100644 +--- a/infra/packaging/preset/20200630 ++++ b/infra/packaging/preset/20200630 +@@ -14,6 +14,7 @@ function preset_configure() + REQUIRED_UNITS+=("souschef") + REQUIRED_UNITS+=("safemain") + REQUIRED_UNITS+=("arser") ++ REQUIRED_UNITS+=("vconone") + # Hermes Logging Framework + REQUIRED_UNITS+=("hermes" "hermes-std") + # loco IR and related utilities +@@ -28,11 +29,14 @@ function preset_configure() + REQUIRED_UNITS+=("record-minmax" "circle-quantizer") + REQUIRED_UNITS+=("one-cmds") + ++ NPROC=${NPROC:-$(cat /proc/cpuinfo | grep -c processor)} ++ + # TODO Use "nncc configure" and "nncc build" + cmake \ + -DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \ + -DCMAKE_BUILD_TYPE=release \ + -DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \ ++ -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \ + ${EXTRA_OPTIONS[@]} \ + "${NNAS_PROJECT_PATH}/infra/nncc" + } +@@ -44,14 +48,4 @@ function preset_install() + + # Install tf2nnpkg + install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.${PRESET}" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg" +- +- # Create python virtual enviornment +- python3 -m venv "${NNAS_INSTALL_PREFIX}/bin/venv" +- +- # Install tensorflow +- source "${NNAS_INSTALL_PREFIX}/bin/venv/bin/activate" +- python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ +- install -U pip setuptools +- python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ +- install tensorflow-cpu==2.3.0rc0 + } +diff --git a/infra/packaging/res/tf2nnpkg.20200630 b/infra/packaging/res/tf2nnpkg.20200630 +index 9101f82..7846fd3 100644 +--- a/infra/packaging/res/tf2nnpkg.20200630 ++++ b/infra/packaging/res/tf2nnpkg.20200630 +@@ -14,10 +14,16 @@ command_exists() { + usage() + { + echo "Convert TensorFlow model to nnpackage." 
+- echo "Usage: tf2nnpkg --info --graphdef [OPTION] -o " +- exit 0 ++ echo "Usage: tf2nnpkg" ++ echo " --info " ++ echo " --graphdef " ++ echo " -o " ++ echo " --v2 (optional) Use TF 2.x interface" ++ exit 255 + } + ++TF_INTERFACE="--v1" ++ + # Parse command-line arguments + # + while [ "$#" -ne 0 ]; do +@@ -39,6 +45,10 @@ while [ "$#" -ne 0 ]; do + export OUTPUT_DIR="$2" + shift 2 + ;; ++ '--v2') ++ TF_INTERFACE="--v2" ++ shift ++ ;; + *) + echo "${CUR}" + shift +@@ -83,10 +93,7 @@ OUTPUT=$(awk -F, '/^output/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' ' + INPUT_SHAPES=$(grep ^input ${INFO_FILE} | cut -d "[" -f2 | cut -d "]" -f1 | tr -d ' ' | xargs | tr ' ' ':') + + # generate tflite file +-python "${ROOT}/bin/tf2tfliteV2.py" --v2 --input_path ${GRAPHDEF_FILE} \ +---output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ +---input_arrays ${INPUT} --output_arrays ${OUTPUT} || \ +-python "${ROOT}/bin/tf2tfliteV2.py" --v1 --input_path ${GRAPHDEF_FILE} \ ++python "${ROOT}/bin/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${GRAPHDEF_FILE} \ + --output_path "${TMPDIR}/${MODEL_NAME}.tflite" \ + --input_arrays ${INPUT} --input_shapes ${INPUT_SHAPES} \ + --output_arrays ${OUTPUT} +diff --git a/infra/scripts/build-tcm.sh b/infra/scripts/build-tcm.sh +new file mode 100755 +index 0000000..22fb335 +--- /dev/null ++++ b/infra/scripts/build-tcm.sh +@@ -0,0 +1,24 @@ ++#!/bin/bash ++# ++# STEP 1 ++# Download latest TCM tool from ++# https://github.sec.samsung.net/RS-TCM/tca-standalone/releases/download/v0.0.8/tca-standalone-0.0.8.jar ++# ++# STEP 2 ++# Create symbolic link `./src` for source directory to be analyzed which has `.ahub` configuration. ++# ++# STEP 3 ++# run this `build-tcm.sh` script. ++# ++# See the following link for additional details. ++# https://github.sec.samsung.net/RS-TCM/tca-standalone/wiki/Tutorials-CPP-Gtest ++# ++ ++echo ${PROJECT_DIR:=${PWD}} ++ ++java -jar $PROJECT_DIR/tca-standalone-0.0.8.jar \ ++ --outdir=$PROJECT_DIR/tcm-output \ ++ --config=$PROJECT_DIR/.ahub/tcchecker-tca/config.yaml \ ++ --local=$PROJECT_DIR/src \ ++ --logfile=$PROJECT_DIR/tcm-output/tcm.log \ ++ --debug +diff --git a/infra/scripts/compiler_modules.sh b/infra/scripts/compiler_modules.sh +index d436e8a..a0323e0 100644 +--- a/infra/scripts/compiler_modules.sh ++++ b/infra/scripts/compiler_modules.sh +@@ -7,7 +7,7 @@ DEBUG_BUILD_ITEMS="angkor;cwrap;pepper-str;pepper-strcast;pp;stdex" + DEBUG_BUILD_ITEMS+=";oops;pepper-assert" + DEBUG_BUILD_ITEMS+=";hermes;hermes-std" + DEBUG_BUILD_ITEMS+=";loco;locop;locomotiv;logo-core;logo" +-DEBUG_BUILD_ITEMS+=";foder;souschef;arser" ++DEBUG_BUILD_ITEMS+=";foder;souschef;arser;vconone" + DEBUG_BUILD_ITEMS+=";safemain;mio-circle;mio-tflite" + DEBUG_BUILD_ITEMS+=";tflite2circle" + DEBUG_BUILD_ITEMS+=";luci" +diff --git a/infra/scripts/docker_build_cross_aarch64_runtime.sh b/infra/scripts/docker_build_cross_aarch64_runtime.sh +index 7da6736..011d14c 100755 +--- a/infra/scripts/docker_build_cross_aarch64_runtime.sh ++++ b/infra/scripts/docker_build_cross_aarch64_runtime.sh +@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + ROOT_PATH="$CURRENT_PATH/../../" + + # prepare rootfs +-if [ ! -d $ROOTFS_DIR ]; then ++if [ -z "$ROOTFS_DIR" ] || [ ! 
-d $ROOTFS_DIR ]; then + echo "It will use default rootfs path" + else + DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" +diff --git a/infra/scripts/docker_build_cross_arm_runtime.sh b/infra/scripts/docker_build_cross_arm_runtime.sh +index f1f666a..551fb57 100755 +--- a/infra/scripts/docker_build_cross_arm_runtime.sh ++++ b/infra/scripts/docker_build_cross_arm_runtime.sh +@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + ROOT_PATH="$CURRENT_PATH/../../" + + # prepare rootfs +-if [ ! -d $ROOTFS_DIR ]; then ++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then + echo "It will use default rootfs path" + else + DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" +diff --git a/infra/scripts/docker_build_cross_arm_runtime_release.sh b/infra/scripts/docker_build_cross_arm_runtime_release.sh +index ea66f17..876f318 100755 +--- a/infra/scripts/docker_build_cross_arm_runtime_release.sh ++++ b/infra/scripts/docker_build_cross_arm_runtime_release.sh +@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + ROOT_PATH="$CURRENT_PATH/../../" + + # prepare rootfs +-if [ ! -d $ROOTFS_DIR ]; then ++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then + echo "It will use default rootfs path" + else + DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" +diff --git a/infra/scripts/docker_build_cross_coverage.sh b/infra/scripts/docker_build_cross_coverage.sh +index 08244e5..f42251b 100755 +--- a/infra/scripts/docker_build_cross_coverage.sh ++++ b/infra/scripts/docker_build_cross_coverage.sh +@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + ROOT_PATH="$CURRENT_PATH/../../" + + # prepare rootfs +-if [ ! -d $ROOTFS_DIR ]; then ++if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then + echo "It will use default rootfs path" + else + DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" +diff --git a/infra/scripts/docker_build_nncc.sh b/infra/scripts/docker_build_nncc.sh +index 418b50d..5b12531 100755 +--- a/infra/scripts/docker_build_nncc.sh ++++ b/infra/scripts/docker_build_nncc.sh +@@ -54,6 +54,16 @@ pushd $ROOT_PATH > /dev/null + mkdir -p ${NNCC_INSTALL_PREFIX} + ./nncc docker-run ./nnas create-package --prefix "${PWD}/${NNCC_INSTALL_PREFIX}" -- "${CONFIG_OPTIONS}" + ++# create python virtual environment ++./nncc docker-run python3 -m venv "${NNCC_INSTALL_PREFIX}/bin/venv" ++ ++./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \ ++ -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ ++ install -U pip setuptools ++./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \ ++ -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \ ++ install tensorflow-cpu==2.3.0rc0 ++ + mkdir -p ${ARCHIVE_PATH} + tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} ./ + +diff --git a/infra/scripts/docker_build_tizen_cross.sh b/infra/scripts/docker_build_tizen_cross.sh +index 18809ad..ee0f183 100755 +--- a/infra/scripts/docker_build_tizen_cross.sh ++++ b/infra/scripts/docker_build_tizen_cross.sh +@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + ROOT_PATH="$CURRENT_PATH/../../" + + # prepare rootfs +-if [ ! -d $ROOTFS_DIR ]; then ++if [ -z "$ROOTFS_DIR" ] || [ ! 
-d $ROOTFS_DIR ]; then + echo "It will use default rootfs path" + else + DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs" +diff --git a/infra/scripts/docker_collect_nnpkg_resources.sh b/infra/scripts/docker_collect_nnpkg_resources.sh +index 556c5bd..55adaa1 100755 +--- a/infra/scripts/docker_collect_nnpkg_resources.sh ++++ b/infra/scripts/docker_collect_nnpkg_resources.sh +@@ -60,7 +60,7 @@ pushd $ROOT_PATH > /dev/null + REQUIRED_UNITS=() + # Common Libraries + REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex") +-REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "oops") ++REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "vconone") + # Hermes Logging Framework + REQUIRED_UNITS+=("hermes" "hermes-std") + # loco IR and related utilities +diff --git a/infra/scripts/tizen_xu4_test.sh b/infra/scripts/tizen_xu4_test.sh +index 5521b5f..640a0e0 100755 +--- a/infra/scripts/tizen_xu4_test.sh ++++ b/infra/scripts/tizen_xu4_test.sh +@@ -23,7 +23,7 @@ function install_model() + { + # download tflite model files + pushd $HOST_HOME +- tests/scripts/framework/run_test.sh --download=on ++ tests/scripts/framework/run_test.sh --download=on --run=off + # TODO Since this command removes model file(.zip), + # We must always download the file unlike model file(.tflite). + # Because caching applies only to tflite file. +diff --git a/packaging/nnfw.spec b/packaging/nnfw.spec +index ce1cd0b..e26ffcb 100644 +--- a/packaging/nnfw.spec ++++ b/packaging/nnfw.spec +@@ -30,7 +30,7 @@ BuildRequires: flatbuffers-devel + %ifarch %{arm} aarch64 + # Require python for acl-ex library build pre-process + BuildRequires: python +-BuildRequires: libarmcl-devel ++BuildRequires: libarmcl-devel >= v20.05 + %endif + + Requires(post): /sbin/ldconfig +diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe +new file mode 100644 +index 0000000..7322e90 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe +@@ -0,0 +1,26 @@ ++operand { ++ name: "ifm" ++ type: UINT8 ++ shape { dim: 1 dim: 8 dim: 8 dim: 1 } ++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { dim: 1 dim: 7 dim: 7 dim: 1 } ++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } ++} ++operation { ++ type: "AveragePool2D" ++ averagepool2d_options { ++ padding: VALID ++ stride_w: 1 ++ stride_h: 1 ++ filter_width: 2 ++ filter_height: 2 ++ } ++ input: "ifm" ++ output: "ofm" ++} ++input: "ifm" ++output: "ofm" +diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe +new file mode 100644 +index 0000000..a09afc1 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe +@@ -0,0 +1,44 @@ ++operand { ++ name: "ifm" ++ type: FLOAT32 ++ shape { dim: 1 dim: 4 dim: 5 dim: 5 } ++} ++operand { ++ name: "ker" ++ type: FLOAT32 ++ shape { dim: 1 dim: 1 dim: 2 dim: 25 } ++} ++operand { ++ name: "bias" ++ type: FLOAT32 ++ shape { dim: 25 } ++ filler { ++ tag: "constant" ++ arg: "1.1" ++ } ++} ++operand { ++ name: "ofm" ++ type: FLOAT32 ++ shape { dim: 1 dim: 2 dim: 2 dim: 25 } ++} ++operation { ++ type: "DepthwiseConv2D" ++ version: 2 ++ depthwiseconv2d_options { ++ padding: VALID ++ stride_w: 2 ++ stride_h: 2 ++ dilation_w_factor: 2 ++ 
dilation_h_factor: 1 ++ depth_multiplier: 5 ++ activation : RELU6 ++ } ++ input: "ifm" ++ input: "ker" ++ input: "bias" ++ output: "ofm" ++} ++input: "ifm" ++input: "ker" ++output: "ofm" +diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule +new file mode 100644 +index 0000000..edfabc6 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule +@@ -0,0 +1,3 @@ ++# To check if DEPTHWISE_CONV_2D version is 2 ++ ++RULE "OP_VERSION_CHECK" $(op_version DEPTHWISE_CONV_2D) '=' 2 +diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe +new file mode 100644 +index 0000000..5e0b6b5 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe +@@ -0,0 +1,61 @@ ++operand { ++ name: "ifm" ++ type: UINT8 ++ shape { dim: 1 dim: 112 dim: 112 dim: 4 } ++ quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 } ++} ++operand { ++ name: "ker" ++ type: UINT8 ++ shape { dim: 1 dim: 3 dim: 3 dim: 4 } ++ filler { ++ tag: "gaussian" ++ arg: "0.0" ++ arg: "1.0" ++ } ++ quant { ++ min: -30.3175 min: -0.779597 min: -10.2751 min: -10.8594 ++ max: 4.35049 max: 2.70807 max: 11.0269 max: 20.97 ++ scale:0.135953 scale: 0.0136771 scale: 0.0835375 scale: 0.124821 ++ zero_point:223 zero_point: 57 zero_point: 123 zero_point: 87 ++ quantized_dimension: 3 ++ } ++} ++operand { ++ name: "bias" ++ type: INT32 ++ shape { dim: 4 } ++ filler { ++ tag: "gaussian" ++ arg: "0" ++ arg: "1.0" ++ } ++ quant { ++ scale: 1.4758e-16 scale: 3.15185e-05 scale: 2.20685e-05 scale: 1.72205e-16 ++ zero_point: 0 zero_point: 0 zero_point: 0 zero_point: 0 ++ } ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { dim: 1 dim: 112 dim: 112 dim: 4 } ++ quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 } ++ ++} ++operation { ++ type: "DepthwiseConv2D" ++ depthwiseconv2d_options { ++ padding: SAME ++ stride_w: 1 ++ stride_h: 1 ++ depth_multiplier: 1 ++ activation : RELU6 ++ } ++ input: "ifm" ++ input: "ker" ++ input: "bias" ++ output: "ofm" ++} ++input: "ifm" ++input: "ker" ++output: "ofm" +diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe +new file mode 100644 +index 0000000..3fff5cd +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe +@@ -0,0 +1,22 @@ ++operand { ++ name: "ifm1" ++ type: UINT8 ++ shape { dim: 1 dim: 4 dim: 4 dim: 3 } ++ quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128} ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { dim: 1 dim: 4 dim: 4 dim: 3 } ++ quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128} ++} ++operation { ++ type: "L2Normalize" ++ l2norm_options { ++ activation: NONE ++ } ++ input: "ifm1" ++ output: "ofm" ++} ++input: "ifm1" ++output: "ofm" +diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe +new file 
mode 100644 +index 0000000..7b2a84d +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe +@@ -0,0 +1,19 @@ ++operand { ++ name: "ifm" ++ type: UINT8 ++ shape { dim: 1 dim: 3 dim: 3 dim: 2 } ++ quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 } ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { dim: 1 dim: 3 dim: 3 dim: 2 } ++ quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 } ++} ++operation { ++ type: "Logistic" ++ input: "ifm" ++ output: "ofm" ++} ++input: "ifm" ++output: "ofm" +diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe +index 79271a4..1313e26 100644 +--- a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe ++++ b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe +@@ -10,7 +10,7 @@ operand { + operand { + name: "ker" + type: FLOAT32 +- shape { dim: 1 dim: 3 dim: 3 dim: 1 } ++ shape { dim: 3 dim: 1 dim: 1 dim: 3 } + filler { + tag: "gaussian" + arg: "0.0" +diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_000/test.recipe +new file mode 100644 +index 0000000..887380c +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_000/test.recipe +@@ -0,0 +1,27 @@ ++operand { ++ name: "ifm" ++ type: FLOAT32 ++ shape { dim: 4 } ++} ++operand { ++ name: "ofm" ++ type: FLOAT32 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT32 ++ shape { dim: 4 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT32 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_000/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_001/test.recipe +new file mode 100644 +index 0000000..9beb516 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_001/test.recipe +@@ -0,0 +1,27 @@ ++operand { ++ name: "ifm" ++ type: FLOAT32 ++ shape { dim: 4 } ++} ++operand { ++ name: "ofm" ++ type: FLOAT32 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT64 ++ shape { dim: 4 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT64 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_001/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.recipe b/res/TensorFlowLiteRecipes/Unique_002/test.recipe +new file mode 100644 +index 0000000..67b947f +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_002/test.recipe +@@ -0,0 +1,27 @@ ++operand { ++ name: "ifm" ++ type: INT32 ++ shape { dim: 5 } ++} ++operand { ++ name: "ofm" ++ type: INT32 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT32 ++ shape { dim: 5 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT32 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.reverse b/res/TensorFlowLiteRecipes/Unique_002/test.reverse +new file mode 100644 
+index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.recipe b/res/TensorFlowLiteRecipes/Unique_003/test.recipe +new file mode 100644 +index 0000000..375db66 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_003/test.recipe +@@ -0,0 +1,27 @@ ++operand { ++ name: "ifm" ++ type: INT32 ++ shape { dim: 5 } ++} ++operand { ++ name: "ofm" ++ type: INT32 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT64 ++ shape { dim: 5 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT64 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.reverse b/res/TensorFlowLiteRecipes/Unique_003/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe +new file mode 100644 +index 0000000..d3985e4 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe +@@ -0,0 +1,28 @@ ++operand { ++ name: "ifm" ++ type: UINT8 ++ shape { dim: 4 } ++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT32 ++ shape { dim: 4 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT32 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe +new file mode 100644 +index 0000000..b08dd85 +--- /dev/null ++++ b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe +@@ -0,0 +1,28 @@ ++operand { ++ name: "ifm" ++ type: UINT8 ++ shape { dim: 5 } ++ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } ++} ++operand { ++ name: "ofm" ++ type: UINT8 ++ shape { } ++} ++operand { ++ name: "ofm_idx" ++ type: INT64 ++ shape { dim: 5 } ++} ++operation { ++ type: "Unique" ++ unique_options { ++ idx_out_type: INT64 ++ } ++ input: "ifm" ++ output: "ofm" ++ output: "ofm_idx" ++} ++input: "ifm" ++output: "ofm" ++output: "ofm_idx" +diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse +new file mode 100644 +index 0000000..e69de29 +diff --git a/runtime/libs/benchmark/CMakeLists.txt b/runtime/libs/benchmark/CMakeLists.txt +index 2af0ffa..748b2d1 100644 +--- a/runtime/libs/benchmark/CMakeLists.txt ++++ b/runtime/libs/benchmark/CMakeLists.txt +@@ -1,6 +1,5 @@ + file(GLOB_RECURSE SOURCES "src/*.cpp") + +-add_library(nnfw_lib_benchmark SHARED ${SOURCES}) ++add_library(nnfw_lib_benchmark STATIC ${SOURCES}) + target_include_directories(nnfw_lib_benchmark PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + target_link_libraries(nnfw_lib_benchmark PRIVATE ${LIB_PTHREAD}) +-install(TARGETS nnfw_lib_benchmark DESTINATION lib) +diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp +index 7a3f9a5..df573da 100644 +--- a/runtime/libs/benchmark/src/Result.cpp ++++ b/runtime/libs/benchmark/src/Result.cpp +@@ -166,7 +166,7 @@ Result::Result(const Phases &phases) + if (option.memory) + { + print_memory = true; +- for (int i = PhaseEnum::MODEL_LOAD; i <= PhaseEnum::EXECUTE; ++i) ++ for (int i = 
PhaseEnum::MODEL_LOAD; i < PhaseEnum::EXECUTE; ++i) + { + auto phase = phases.at(gPhaseStrings[i]); + for (int j = MemoryType::RSS; j <= MemoryType::PSS; ++j) +diff --git a/runtime/onert/api/include/nnfw.h b/runtime/onert/api/include/nnfw.h +index 031aabd..03a3aed 100644 +--- a/runtime/onert/api/include/nnfw.h ++++ b/runtime/onert/api/include/nnfw.h +@@ -99,6 +99,8 @@ typedef enum { + NNFW_STATUS_ERROR = 1, + /** Unexpected null argument is given. */ + NNFW_STATUS_UNEXPECTED_NULL = 2, ++ /** When a function was called but it is not valid for the current session state. */ ++ NNFW_STATUS_INVALID_STATE = 3, + } NNFW_STATUS; + + /** +@@ -432,10 +434,10 @@ NNFW_STATUS nnfw_output_tensorinfo(nnfw_session *session, uint32_t index, + * + *

Supported backends differs on each platforms. + * For example, `x86_64` supports "cpu" only. +- * Can set multiple backends by semicolon (ex: "acl_cl;cpu"). +- * Among the multiple backends, the 1st element is used as default backend. +- * +- * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon", "srcn" ++ * Multiple backends can be set and they must be separated by a semicolon (ex: "acl_cl;cpu"). ++ * For each backend string, `libbackend_{backend}.so` will be dynamically loaded during ++ * {@link nnfw_prepare}. ++ * Among the multiple backends, the 1st element is used as the default backend. + * + * @param[in] session session to which avilable backends are set + * @param[in] backends available backends on which nnfw uses +@@ -449,12 +451,10 @@ NNFW_STATUS nnfw_set_available_backends(nnfw_session *session, const char *backe + * + * This function should be called before {@link nnfw_prepare} is invoked. + * +- * Supported backends differs on each platforms. +- * For example, `x86_64` supports "cpu" only. +- * The backend for op has higher priority than available backends specified by +- * nnfw_set_available_backends. ++ * The backend for op has higher priority than available backends specified by ++ * {@link nnfw_set_available_backends}.
+ * +- * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon" ++ * @deprecated Deprecated since 1.8.0. + * + * @param[in] session session to be modified + * @param[in] op operation to be set +diff --git a/runtime/onert/api/src/nnfw_api.cc b/runtime/onert/api/src/nnfw_api.cc +index 0747583..34a46ed 100644 +--- a/runtime/onert/api/src/nnfw_api.cc ++++ b/runtime/onert/api/src/nnfw_api.cc +@@ -31,6 +31,7 @@ STATIC_ASSERT_ENUM_CHECK(NNFW_TYPE_TENSOR_INT64, 5); + STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_NO_ERROR, 0); + STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_ERROR, 1); + STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_UNEXPECTED_NULL, 2); ++STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_INVALID_STATE, 3); + + STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_NONE, 0); + STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_CHANNELS_LAST, 1); +diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc +index d03ddd4..b3390fa 100644 +--- a/runtime/onert/api/src/nnfw_api_internal.cc ++++ b/runtime/onert/api/src/nnfw_api_internal.cc +@@ -76,7 +76,7 @@ nnfw_session::~nnfw_session() = default; + NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir) + { + if (!isStateInitialized()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + if (!package_dir) + { +@@ -156,7 +156,7 @@ NNFW_STATUS nnfw_session::prepare() + std::cerr << "invalid state"; + } + std::cerr << std::endl; +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + } + + if (!_subgraphs || !primary_subgraph() || primary_subgraph()->isBuildingPhase()) +@@ -188,7 +188,7 @@ NNFW_STATUS nnfw_session::run() + { + std::cerr << "Error during nnfw_session::run : " + << "run should be run after prepare" << std::endl; +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + } + + try +@@ -211,7 +211,7 @@ NNFW_STATUS nnfw_session::run_async() + { + std::cerr << "Error during nnfw_session::run_async : " + << "run_async should be run after prepare" << std::endl; +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + } + + _execution->startExecute(); +@@ -241,7 +241,7 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo + if (!isStatePreparedOrFinishedRun()) + { + std::cerr << "Error during nnfw_session::set_input : invalid state" << std::endl; +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + } + + if (!buffer && length != 0) +@@ -270,7 +270,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b + if (!isStatePreparedOrFinishedRun()) + { + std::cerr << "Error during nnfw_session::set_output : invalid state" << std::endl; +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + } + + if (!buffer && length != 0) +@@ -296,7 +296,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b + NNFW_STATUS nnfw_session::input_size(uint32_t *number) + { + if (isStateInitialized()) // Model is not loaded +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + try + { +@@ -318,7 +318,7 @@ NNFW_STATUS nnfw_session::input_size(uint32_t *number) + NNFW_STATUS nnfw_session::output_size(uint32_t *number) + { + if (isStateInitialized()) // Model is not loaded +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + try + { +@@ -410,7 +410,7 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti) + { + std::cerr << "Error during set_input_tensorinfo : should be run after load_model" + << std::endl; +- return NNFW_STATUS_ERROR; ++ return 
NNFW_STATUS_INVALID_STATE; + } + + if (ti.rank <= 0 || ti.rank > NNFW_MAX_RANK) +@@ -463,6 +463,9 @@ NNFW_STATUS nnfw_session::set_input_tensorinfo(uint32_t index, const nnfw_tensor + + NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) + { ++ if (isStateInitialized()) ++ return NNFW_STATUS_INVALID_STATE; ++ + try + { + if (ti == nullptr) +@@ -499,7 +502,7 @@ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) + NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) + { + if (isStateInitialized()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + if (ti == nullptr) + { +@@ -570,7 +573,7 @@ static std::string get_op_backend_string(std::string op) + NNFW_STATUS nnfw_session::set_available_backends(const char *backends) + { + if (!isStateModelLoaded()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + try + { +@@ -596,7 +599,7 @@ NNFW_STATUS nnfw_session::set_available_backends(const char *backends) + NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend) + { + if (!isStateModelLoaded()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + try + { +@@ -627,7 +630,7 @@ NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend) + NNFW_STATUS nnfw_session::set_config(const char *key, const char *value) + { + if (!isStateModelLoaded()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + auto &options = _compiler->options(); + +@@ -693,7 +696,7 @@ onert::ir::Graph *nnfw_session::primary_subgraph() + NNFW_STATUS nnfw_session::get_config(const char *key, char *value, size_t value_size) + { + if (!isStateModelLoaded()) +- return NNFW_STATUS_ERROR; ++ return NNFW_STATUS_INVALID_STATE; + + auto &options = _compiler->options(); + +diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc +index 3ca4058..4ab2d4c 100644 +--- a/runtime/onert/backend/acl_cl/KernelGenerator.cc ++++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc +@@ -31,6 +31,7 @@ + #include "exec/FunctionSequence.h" + #include "util/logging.h" + #include "util/Utils.h" ++#include "AclKernelGen.h" + + namespace onert + { +@@ -76,15 +77,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) + const auto block_size_index{ + node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto block_size_alloc = _tensor_builder->at(block_size_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto block_size_tensor = _tensor_builder->at(block_size_index).get(); + + assert(_ctx.at(block_size_index).data()); + + auto fn = std::make_unique<::arm_compute::CLBatchToSpaceLayer>(); + +- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -96,15 +97,27 @@ void KernelGenerator::visit(const ir::operation::Cast &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- const auto input_sub_type = 
_ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 +- ? arm_compute::SubDataType::BOOL +- : arm_compute::SubDataType::NONE; ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- auto fn = std::make_unique<::arm_compute::CLCast>(); ++ std::unique_ptr<::arm_compute::IFunction> fn; ++ if (ifm_tensor->data_type() == ofm_tensor->data_type()) ++ { ++ auto l = std::make_unique<::arm_compute::CLCopy>(); ++ ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); ++ fn = std::move(l); ++ } ++ else ++ { ++ auto l = std::make_unique<::arm_compute::CLCast>(); ++ ++ // TODO Support converting float to int32 as round down ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); ++ ++ fn = std::move(l); ++ } + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -132,10 +145,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + ker_width, ker_height); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); +@@ -143,8 +156,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + auto fn = std::make_unique<::arm_compute::CLConvolutionLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), +- conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); ++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ++ ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(), ++ ::arm_compute::Size2D(1U, 1U), act_info); + + _return_fn = asAclClFunction(std::move(fn)); + } +@@ -171,10 +185,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) + const auto multiplier = node.param().multiplier; + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); +@@ -182,8 +196,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) + { + auto fn = std::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), +- ofm_alloc->handle(), conv_info, multiplier, act_info); ++ 
fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ++ ofm_tensor->handle(), conv_info, multiplier, act_info); + + _return_fn = asAclClFunction(std::move(fn)); + } +@@ -217,19 +231,20 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) + VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, +- ::arm_compute::Size2D{kw, kh}, +- acl_common::asPadStrideInfo(padding, stride)}; ++ ::arm_compute::PoolingLayerInfo info{ ++ ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh}, ++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::AvgPool2D &node) +@@ -260,19 +275,21 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) + VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, +- acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; ++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride), ++ true /* exclude_padding */}; + + auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Concat &node) +@@ -296,7 +313,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + return; + } + +- auto output_alloc = _tensor_builder->at(ofm_index).get(); ++ auto output_tensor = _tensor_builder->at(ofm_index).get(); + std::vector<::arm_compute::ICLTensor *> input_tensors; + for (auto &ifm_ind : input_indexes) + input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); +@@ -305,7 +322,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + if (input_indexes.size() < 2) + { + auto l = std::make_unique<::arm_compute::CLCopy>(); +- l->configure(input_tensors.at(0), output_alloc->handle()); ++ l->configure(input_tensors.at(0), output_tensor->handle()); + fn = std::move(l); + } + else +@@ -313,10 +330,10 @@ void KernelGenerator::visit(const 
ir::operation::Concat &node) + auto l = std::make_unique<::arm_compute::CLConcatenateLayer>(); + const auto rank = _ctx.at(ofm_index).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = output_alloc->layout(); ++ const auto backend_layout = output_tensor->layout(); + const auto fixed_axis = + acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); +- l->configure(input_tensors, output_alloc->handle(), fixed_axis); ++ l->configure(input_tensors, output_tensor->handle(), fixed_axis); + fn = std::move(l); + } + +@@ -327,75 +344,15 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + + void KernelGenerator::visit(const ir::operation::FullyConnected &node) + { +- using ir::operation::FullyConnected; +- + const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; +- const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; +- const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; +- +- const auto input_rank = _ctx.at(input_index).shape().rank(); +- +- const auto output_size = +- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); +- UNUSED_RELEASE(output_size); +- assert(_ctx.at(bias_index).shape().dim(0) == output_size); +- assert(_ctx.at(weight_index).shape().dim(0) == output_size); +- const auto batch_size = +- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); +- const auto input_size = +- _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); +- +- // Check for reshaping input's shape into rank-2 +- bool needs_reshape = false; +- ir::Shape reshape(2); +- if (input_rank == 3 || input_rank == 4) +- { +- const auto &ifm_shape = _ctx.at(input_index).shape(); +- auto feature_size = 1; +- for (int i = 0; i < ifm_shape.rank(); ++i) +- { +- feature_size *= ifm_shape.dim(i); +- } +- +- UNUSED_RELEASE(feature_size); +- assert(feature_size == batch_size * input_size); +- +- // for reshaping +- needs_reshape = true; +- reshape.dim(0) = batch_size; /* H */ +- reshape.dim(1) = input_size; /* W */ +- } +- ++ auto output_tensor = _tensor_builder->at(output_index).get(); + const auto activation = node.param().activation; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- const auto input_alloc = _tensor_builder->at(input_index).get(); +- const auto weight_alloc = _tensor_builder->at(weight_index).get(); +- const auto bias_alloc = _tensor_builder->at(bias_index).get(); +- const auto frontend_layout = _current_op_seq_layout; +- const auto acl_layout = output_alloc->handle()->info()->data_layout(); +- +- auto fn = std::make_unique( +- _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); +- +- arm_compute::CLFullyConnectedReshapingLayer::KernelType kernel_type = +- arm_compute::CLFullyConnectedReshapingLayer::KernelType::GENERAL; +- if (_ctx.at(weight_index).isConstant()) +- { +- kernel_type = arm_compute::CLFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS; +- assert(_ctx.at(weight_index).data()); +- } +- fn->configure( +- input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), +- needs_reshape, +- ::onert::backend::acl_common::asTensorShape( +- reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), +- kernel_type); +- ++ auto fn = acl_common::kernelGenFullyConnected( ++ node, _ctx, _tensor_builder, _current_op_seq_layout); + 
_return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), +- ActivationBuilder::generate(activation, output_alloc->handle())); ++ std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Mul &node) +@@ -406,17 +363,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLPixelWiseMultiplication>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Reduce &node) +@@ -427,14 +385,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + const auto keep_dims{node.param().keep_dims}; + const auto reduce_type = node.param().reduce_type; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + // Convert to ACL axes taking into account negative values and possible duplicates. 
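// A rough sketch of what the acl_common axis helpers below are assumed to do
// (not verified against acl_common, just the usual onert convention):
//   1. wrap negatives:  axis < 0  ->  axis += input_rank   (e.g. -1 -> 3 for rank 4)
//   2. drop duplicates, since reducing the same axis twice changes nothing
//   3. permute each axis from the frontend layout (e.g. NHWC) to the backend layout,
//      then reverse it into ACL's innermost-first numbering, roughly rank - 1 - axis.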
+ const auto &axes = _ctx.at(axes_index); + const auto input_rank = _ctx.at(input_index).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = input_alloc->layout(); ++ const auto backend_layout = input_tensor->layout(); + + std::unique_ptr fn; + if (reduce_type == ir::operation::Reduce::ReduceType::MEAN) +@@ -443,7 +401,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + + const auto acl_axes = + acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); +- l->configure(input_alloc->handle(), acl_axes, keep_dims, output_alloc->handle()); ++ l->configure(input_tensor->handle(), acl_axes, keep_dims, output_tensor->handle()); + + fn = std::move(l); + } +@@ -453,7 +411,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout); +- l->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims, ++ l->configure(input_tensor->handle(), output_tensor->handle(), acl_axes, keep_dims, + acl_common::convertReduceType(reduce_type)); + + fn = std::move(l); +@@ -469,13 +427,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + // NOTE This operation must not be changed the layout from frontend to backend + // So, PermutationOperationPass makes layouts of frontend and backend the same. 
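// Put differently: CLReshapeLayer is just a copy in backend memory order, so the layout
// question can presumably be ignored only while both tensors stay below rank 4 (where the
// NHWC/NCHW distinction does not reorder data in this backend); the assert below rejects
// rank >= 4 unless the frontend and backend layouts already agree, which
// PermutationOperationPass is expected to guarantee.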
+ const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = output_alloc->layout(); ++ const auto backend_layout = output_tensor->layout(); + assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || + frontend_layout == backend_layout); + UNUSED_RELEASE(frontend_layout); +@@ -483,7 +441,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) + + auto fn = std::make_unique<::arm_compute::CLReshapeLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -503,10 +461,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) + (void)dims; + (void)ndim; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + auto fn = std::make_unique(); +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + auto acl_fn = asAclClFunction(std::move(fn)); + _return_fn = std::move(acl_fn); + } +@@ -516,15 +474,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -538,13 +496,13 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) + + const auto beta = node.param().beta; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::CLSoftmaxLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), beta); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), beta); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -558,10 +516,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) + const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; + const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; + +- auto outputData_alloc = _tensor_builder->at(output_index).get(); +- auto inputData_alloc = _tensor_builder->at(input_index).get(); ++ auto outputData_tensor = _tensor_builder->at(output_index).get(); ++ auto inputData_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = inputData_alloc->layout(); ++ const auto backend_layout = inputData_tensor->layout(); + + 
// Set initializers for indices data such as order of inputData + int input_rank = _ctx.at(input_index).shape().rank(); +@@ -613,7 +571,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) + + auto fn = std::make_unique<::arm_compute::CLSlice>(); + +- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); ++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -628,10 +586,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; + const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; + +- auto outputData_alloc = _tensor_builder->at(output_index).get(); +- auto inputData_alloc = _tensor_builder->at(input_index).get(); ++ auto outputData_tensor = _tensor_builder->at(output_index).get(); ++ auto inputData_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = inputData_alloc->layout(); ++ const auto backend_layout = inputData_tensor->layout(); + + // Set initializers for indices data such as order of inputData + int input_rank = _ctx.at(input_index).shape().rank(); +@@ -704,7 +662,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + + auto fn = std::make_unique<::arm_compute::CLStridedSlice>(); + +- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, ++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, + strides_set, begin_mask, end_mask, shrink_axis_mask); + + auto acl_fn = asAclClFunction(std::move(fn)); +@@ -720,10 +678,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + + const auto rank = _ctx.at(ifm_idx).shape().rank(); + +- auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = ifm_alloc->layout(); ++ const auto backend_layout = ifm_tensor->layout(); + + std::vector pv(perm.cbegin(), perm.cend()); + // Reversed +@@ -732,7 +690,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + + auto fn = std::make_unique<::arm_compute::CLPermute>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -747,17 +705,18 @@ void KernelGenerator::visit(const ir::operation::Add &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLArithmeticAddition>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), + arm_compute::ConvertPolicy::SATURATE); + + _return_fn = std::make_unique( +- 
asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Sub &node) +@@ -768,17 +727,18 @@ void KernelGenerator::visit(const ir::operation::Sub &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLArithmeticSubtraction>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), + arm_compute::ConvertPolicy::SATURATE); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Div &node) +@@ -789,16 +749,17 @@ void KernelGenerator::visit(const ir::operation::Div &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLArithmeticDivision>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Exp &node) +@@ -806,12 +767,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::CLExpLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -823,12 +784,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = 
_tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::CLReshapeLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -842,20 +803,21 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) + const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; + const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto gamma_alloc = _tensor_builder->at(gamma_index).get(); +- auto beta_alloc = _tensor_builder->at(beta_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto gamma_tensor = _tensor_builder->at(gamma_index).get(); ++ auto beta_tensor = _tensor_builder->at(beta_index).get(); + auto epsilon = node.param().epsilon; + auto activation = node.param().activation; + + auto fn = std::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), +- beta_alloc->handle(), epsilon); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), ++ beta_tensor->handle(), epsilon); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Logistic &node) +@@ -863,15 +825,15 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; + + auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -884,13 +846,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) + const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::CLBinaryLogicalOp>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), + 
::arm_compute::BinaryLogicalOperation::AND); + + auto acl_fn = asAclClFunction(std::move(fn)); +@@ -900,159 +862,8 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) + + void KernelGenerator::visit(const ir::operation::LSTM &node) + { +- // TODO Support dynamic rnn +- // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. +- const auto scratch_buffer_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; +- const auto output_state_out_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; +- const auto cell_state_out_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; +- const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; +- +- const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; +- const auto input_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional +- const auto input_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; +- const auto input_to_cell_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; +- const auto input_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; +- const auto recurrent_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional +- const auto recurrent_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; +- const auto recurrent_to_cell_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; +- const auto recurrent_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; +- const auto cell_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional +- const auto cell_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional +- const auto cell_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional +- const auto input_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; +- const auto forget_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; +- const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; +- const auto output_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; +- const auto projection_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional +- const auto projection_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional +- const auto output_state_in_index{ +- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; +- const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; +- const auto cell_threshold = node.param().cell_threshold; +- const auto projection_threshold = node.param().projection_threshold; +- +- bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && +- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; +- bool has_recurrent_to_input_weights = +- _ctx.at(recurrent_to_input_weights_index).shape().dim(0) 
!= 0 && +- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; +- bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; +- bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; +- bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && +- _ctx.at(projection_weights_index).shape().dim(1) != 0; +- bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); +- +- // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. +- // true: no CIFG +- // false: CIFG +- // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). +- bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; +- +- // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. +- // But the cell_to_input_weights does not exist in regular CIFG although peephole. +- // true: peephole +- // false: no peephole +- bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; +- +- // NOTE Although the projection weights has data the projection bias may not have data. +- bool has_projection_param = has_projection_weights; +- +- const auto activation = node.param().activation; +- const auto cell_clip = cell_threshold; +- const auto projection_clip = projection_threshold; +- assert(cell_clip >= 0.f && projection_clip >= 0.f); +- +- auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); +- auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); +- auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); +- auto output_alloc = _tensor_builder->at(output_index).get(); +- +- auto input_alloc = _tensor_builder->at(input_index).get(); +- +- auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); +- auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); +- auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); +- auto recurrent_to_forget_weights_alloc = +- _tensor_builder->at(recurrent_to_forget_weights_index).get(); +- auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); +- auto recurrent_to_output_weights_alloc = +- _tensor_builder->at(recurrent_to_output_weights_index).get(); +- +- auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); +- auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); +- auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); +- auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); +- auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); +- +- auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); +- +- auto fn = std::make_unique<::arm_compute::CLLSTMLayer>(); +- +- ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{}; +- if (has_cifg_param) +- { +- auto input_to_input_weights_alloc = +- _tensor_builder->at(input_to_input_weights_index).get(); // optional +- auto recurrent_to_input_weights_alloc = +- _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional +- auto cell_to_input_weights_handle = +- has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() +- : nullptr; // optional (non-cifg && peephole) +- auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional +- lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), +- recurrent_to_input_weights_alloc->handle(), +- cell_to_input_weights_handle, input_gate_bias_alloc->handle()); +- } +- if (has_peephole_param) +- { +- auto cell_to_forget_weights_alloc = +- _tensor_builder->at(cell_to_forget_weights_index).get(); // optional +- auto cell_to_output_weights_alloc = +- _tensor_builder->at(cell_to_output_weights_index).get(); // optional +- lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), +- cell_to_output_weights_alloc->handle()); +- } +- if (has_projection_param) +- { +- auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional +- auto projection_bias_handle = has_projection_bias +- ? _tensor_builder->at(projection_bias_index).get()->handle() +- : nullptr; // optional +- lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); +- } +- +- fn->configure( +- input_alloc->handle(), input_to_forget_weights_alloc->handle(), +- input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), +- recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), +- recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), +- cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), +- cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), +- output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), +- lstm_params, act_info, cell_clip, projection_clip); +- +- auto acl_fn = asAclClFunction(std::move(fn)); +- +- _return_fn = std::move(acl_fn); ++ _return_fn = acl_common::kernelGenLSTM(node, _ctx, _tensor_builder); + } + + void KernelGenerator::visit(const ir::operation::Comparison &node) +@@ -1063,13 +874,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) + + const auto comparison_type = node.param().comparison_type; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::CLComparison>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), + (arm_compute::ComparisonOperation)comparison_type); + + auto acl_fn = asAclClFunction(std::move(fn)); +@@ -1107,13 +918,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node) + for (const auto &input_index : input_indexes) + { + size_t input_rank = _ctx.at(input_index).shape().rank(); +- const auto &input_alloc = _tensor_builder->at(input_index); +- orig_inputs_acl_tensor_shapes.emplace_back(input_alloc->info()->tensor_shape()); +- assert(input_rank == input_alloc->num_dimensions()); +- if (input_rank != input_alloc->info()->num_dimensions()) ++ const auto &input_tensor = _tensor_builder->at(input_index); ++ orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape()); ++ 
assert(input_rank == input_tensor->num_dimensions()); ++ if (input_rank != input_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction +- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( ++ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( + _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); + } + } +@@ -1135,8 +946,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(0)}; + const auto permute_type = node.getPermuteType(); +- auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); + const auto rank = _ctx.at(ofm_idx).shape().rank(); + assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); + +@@ -1149,7 +960,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + + auto l = std::make_unique<::arm_compute::CLPermute>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); + + fn = std::move(l); + } +@@ -1160,7 +971,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + + auto l = std::make_unique<::arm_compute::CLPermute>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); + + fn = std::move(l); + } +@@ -1168,7 +979,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + { + auto l = std::make_unique<::arm_compute::CLCopy>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + } +@@ -1183,12 +994,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLRsqrtLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + _return_fn = asAclClFunction(std::move(fn)); + } +@@ -1198,15 +1009,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1219,12 +1030,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) + + const 
auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLScale>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), + ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, + ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); + +@@ -1238,15 +1049,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1258,15 +1069,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; + + auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1288,25 +1099,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node) + + const auto activation = node.param().activation; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get(); + +- auto input_alloc = _tensor_builder->at(input_index).get(); +- auto weights_alloc = _tensor_builder->at(weights_index).get(); +- auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); +- auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); ++ auto weights_tensor = _tensor_builder->at(weights_index).get(); ++ auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); ++ auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get(); + auto act_info = 
::onert::backend::acl_common::asActivationLayerInfo(activation); + + auto copy_layer = std::make_unique<::arm_compute::CLCopy>(); +- copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); ++ copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle()); + _return_fn = asAclClFunction(std::move(copy_layer)); + +- auto fn = std::make_unique<::arm_compute::CLRNNLayerEx>( ++ auto fn = std::make_unique<::arm_compute::CLRNNLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); +- fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), +- bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), +- act_info); ++ fn->configure(input_tensor->handle(), weights_tensor->handle(), ++ recurrent_weights_tensor->handle(), bias_tensor->handle(), ++ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info); + _return_fn = asAclClFunction(std::move(fn)); + } + +@@ -1315,12 +1126,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLFloor>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1335,10 +1146,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) + node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; + const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto block_size_alloc = _tensor_builder->at(block_size_index).get(); +- auto paddings_alloc = _tensor_builder->at(paddings_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto block_size_tensor = _tensor_builder->at(block_size_index).get(); ++ auto paddings_tensor = _tensor_builder->at(paddings_index).get(); + + assert(_ctx.at(block_size_index).data()); + assert(_ctx.at(paddings_index).data()); +@@ -1346,8 +1157,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) + std::unique_ptr<::arm_compute::IFunction> fn; + + auto l = std::make_unique<::arm_compute::CLSpaceToBatchLayer>(); +- l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), +- ofm_alloc->handle()); ++ l->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(), ++ ofm_tensor->handle()); + fn = std::move(l); + + auto acl_fn = asAclClFunction(std::move(fn)); +@@ -1362,12 +1173,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) + + auto block_size = node.param().block_size; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- auto fn = std::make_unique<::arm_compute::CLSpaceToDepth>(); ++ auto fn = 
std::make_unique<::arm_compute::CLSpaceToDepthLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1389,19 +1200,21 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node) + ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, ++ ifm_tensor->info()->data_layout(), + ::onert::backend::acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclClFunction(std::move(fn)), ++ ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) +@@ -1410,13 +1223,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) + const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto lookups_alloc = _tensor_builder->at(lookups_index).get(); +- auto values_alloc = _tensor_builder->at(values_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto lookups_tensor = _tensor_builder->at(lookups_index).get(); ++ auto values_tensor = _tensor_builder->at(values_index).get(); + + auto fn = std::make_unique<::arm_compute::CLEmbeddingLookup>(); + +- fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); ++ fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1442,15 +1255,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) + float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) + float bias = 0.0f; // Don't offset the reduction. 
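// Why these constants amount to an L2 normalization (a sketch, assuming ACL's unscaled
// cross-map formula  out = in / (bias + alpha * sum(in^2))^beta  when the last
// NormalizationLayerInfo argument below is false):
//   bias = 0, beta = 0.5   ->   out = in / sqrt(alpha * sum(in^2))
// i.e. in / ||in||_2 over the normalization window, consistent with the comments above.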
+ +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, + radius, alpha, beta, bias, false); + + auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1466,17 +1279,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) + const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; + const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto hits_alloc = _tensor_builder->at(hits_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto hits_tensor = _tensor_builder->at(hits_index).get(); + +- auto lookups_alloc = _tensor_builder->at(lookups_index).get(); +- auto keys_alloc = _tensor_builder->at(keys_index).get(); +- auto values_alloc = _tensor_builder->at(values_index).get(); ++ auto lookups_tensor = _tensor_builder->at(lookups_index).get(); ++ auto keys_tensor = _tensor_builder->at(keys_index).get(); ++ auto values_tensor = _tensor_builder->at(values_index).get(); + + auto fn = std::make_unique<::arm_compute::CLHashtableLookup>(); + +- fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), +- output_alloc->handle(), hits_alloc->handle()); ++ fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), ++ output_tensor->handle(), hits_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1489,13 +1302,13 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) + const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto alpha_alloc = _tensor_builder->at(alpha_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto alpha_tensor = _tensor_builder->at(alpha_index).get(); + +- auto fn = std::make_unique<::arm_compute::CLPReLU>(); ++ auto fn = std::make_unique<::arm_compute::CLPReluLayer>(); + +- fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1518,7 +1331,6 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) + (node.param().padding.type == ir::PaddingType::VALID)); + auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride, + ker_shape.W, ker_shape.H); +- + uint32_t invalid_horizontal = 0; + uint32_t invalid_vertical = 0; + if (node.param().padding.type == ir::PaddingType::VALID) +@@ -1528,17 +1340,17 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) + invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); + } + +- auto ofm_alloc = 
_tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); + + const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); + + auto fn = std::make_unique<::arm_compute::CLTransposeConvLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, +- invalid_horizontal, invalid_vertical); ++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), ++ tconv_info, invalid_horizontal, invalid_vertical); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1550,15 +1362,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; + + auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1571,13 +1383,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) + const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::CLBitwiseOr>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1589,12 +1401,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::CLBitwiseNot>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1607,13 +1419,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) + 
const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLElementwiseSquaredDiff>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1634,13 +1446,13 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node) + + const auto k = node.param().k; + +- auto values_alloc = _tensor_builder->at(outputValues_index).get(); +- auto indices_alloc = _tensor_builder->at(outputIndices_index).get(); +- auto input_alloc = _tensor_builder->at(inputData_index).get(); ++ auto values_tensor = _tensor_builder->at(outputValues_index).get(); ++ auto indices_tensor = _tensor_builder->at(outputIndices_index).get(); ++ auto input_tensor = _tensor_builder->at(inputData_index).get(); + + auto fn = std::make_unique<::arm_compute::CLTopKV2>(); + +- fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle()); ++ fn->configure(input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1659,9 +1471,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw); + const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto indices_alloc = _tensor_builder->at(indices_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto indices_tensor = _tensor_builder->at(indices_index).get(); + + // NOTE The frontend layout and backend layout must be the same for this operation. + // If not the same, we have to add a stage(?) to perform permutation of output tensor. It +@@ -1671,43 +1483,43 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + // a model. For example, if a model in NHWC has this operation as output rank == 4, indices + // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W + // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
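// A concrete instance of the case described above (sketch; the actual mapping is done by
// acl_common::ToARMComputeAxis): in a rank-4 NHWC model the frontend axes are
// {0:N, 1:H, 2:W, 3:C}, while an NCHW backend stores the same data as {N, C, H, W}.
// A gather with axis == 2 (W) and rank-2 indices would have to treat W and C as adjacent
// dimensions, but in NCHW they are not, so the assert below only accepts ifm_rank >= 4
// when the frontend and backend layouts match.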
+- const auto backend_layout = ofm_alloc->layout(); ++ const auto backend_layout = ofm_tensor->layout(); + UNUSED_RELEASE(backend_layout); +- assert(backend_layout == ifm_alloc->layout()); +- assert(backend_layout == indices_alloc->layout()); ++ assert(backend_layout == ifm_tensor->layout()); ++ assert(backend_layout == indices_tensor->layout()); + assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); + + auto fn = std::make_unique<::arm_compute::CLGatherEx>(); + + // input is n-D, indices k-D, output is (n + k - 1)-D + size_t n = ifm_rank; +- assert(n == ifm_alloc->num_dimensions()); ++ assert(n == ifm_tensor->num_dimensions()); + size_t k = _ctx.at(indices_index).shape().rank(); +- assert(k == indices_alloc->num_dimensions()); ++ assert(k == indices_tensor->num_dimensions()); + + // Disable applied dim_correction +- const auto orig_ifm_acl_tensor_shape = ifm_alloc->info()->tensor_shape(); +- if (n != ifm_alloc->info()->num_dimensions()) ++ const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape(); ++ if (n != ifm_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction + const auto ifm = _ctx.at(ifm_index); +- ifm_alloc->info()->set_tensor_shape( ++ ifm_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); + } +- const auto orig_indice_acl_tensor_shape = indices_alloc->info()->tensor_shape(); +- if (k != indices_alloc->info()->num_dimensions()) ++ const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape(); ++ if (k != indices_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and indices tensor is applied dim_correction + const auto indices = _ctx.at(indices_index); +- indices_alloc->info()->set_tensor_shape( ++ indices_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); + } + +- fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); ++ fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); + + // Revert disabling applied dim_correction +- ifm_alloc->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); +- indices_alloc->info()->set_tensor_shape(orig_indice_acl_tensor_shape); ++ ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); ++ indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1719,12 +1531,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLNeg>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1736,15 +1548,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto 
input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; + + auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1761,11 +1573,11 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) + + assert((ifm_shape.rank() - 1) == ofm_shape.rank()); + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); + auto frontend_layout = _current_op_seq_layout; +- auto backend_layout = ifm_alloc->layout(); ++ auto backend_layout = ifm_tensor->layout(); + + int axis_value = node.param().axis; + if (axis_value < 0) +@@ -1776,10 +1588,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) + auto acl_axis = + acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); + +- auto fn = std::make_unique<::arm_compute::CLArgOperation>(); ++ auto fn = std::make_unique<::arm_compute::CLArgMinMaxLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis}, +- ::arm_compute::ArgOperation::MAX); ++ fn->configure(ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), ++ ::arm_compute::ReductionOperation::ARG_IDX_MAX); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1791,12 +1603,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + +- auto fn = std::make_unique<::arm_compute::CLCast>(); ++ auto fn = std::make_unique<::arm_compute::CLDequantizationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1814,15 +1626,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod + auto beta = node.param().beta; + auto bias = node.param().bias; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo( + ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); + + auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1837,12 +1649,12 @@ void KernelGenerator::visit(const 
ir::operation::DepthToSpace &node) + auto block_size = node.param().block_size; + assert(block_size > 0); + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + +- auto fn = std::make_unique<::arm_compute::CLDepthToSpace>(); ++ auto fn = std::make_unique<::arm_compute::CLDepthToSpaceLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), block_size); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -1860,13 +1672,13 @@ void KernelGenerator::visit(const ir::operation::Split &node) + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- std::vector output_allocs; ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ std::vector output_tensors; + for (const auto &ofm_ind : output_indexes) +- output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); ++ output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = ifm_alloc->layout(); ++ const auto backend_layout = ifm_tensor->layout(); + auto axis = node.param().axis; + if (axis < 0) + axis += ifm_rank; +@@ -1874,7 +1686,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) + + auto fn = std::make_unique<::arm_compute::CLSplit>(); + +- fn->configure(ifm_alloc->handle(), output_allocs, axis); ++ fn->configure(ifm_tensor->handle(), output_tensors, axis); + + _return_fn = asAclClFunction(std::move(fn)); + } +@@ -1906,13 +1718,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) + for (const auto &output_index : output_indexes) + { + size_t output_rank = _ctx.at(output_index).shape().rank(); +- const auto &output_alloc = _tensor_builder->at(output_index); +- orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape()); +- assert(output_rank == output_alloc->num_dimensions()); +- if (output_rank != output_alloc->info()->num_dimensions()) ++ const auto &output_tensor = _tensor_builder->at(output_index); ++ orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); ++ assert(output_rank == output_tensor->num_dimensions()); ++ if (output_rank != output_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction +- output_alloc->info()->set_tensor_shape(acl_common::asTensorShape( ++ output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( + _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); + } + } +@@ -1959,12 +1771,12 @@ void KernelGenerator::visit(const ir::operation::Pad &node) + + // Disable applied dim_correction + size_t input_rank = _ctx.at(input_index).shape().rank(); +- const auto &input_alloc = _tensor_builder->at(input_index); +- assert(input_rank == input_alloc->num_dimensions()); +- if (input_rank != input_alloc->info()->num_dimensions()) ++ const auto &input_tensor = _tensor_builder->at(input_index); ++ assert(input_rank == input_tensor->num_dimensions()); ++ if (input_rank != input_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction +- 
input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( ++ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( + _ctx.at(input_index).shape(), frontend_layout, backend_layout, false)); + } + +@@ -1982,13 +1794,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) + const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLElementwiseMin>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -2001,13 +1813,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) + const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::CLElementwiseMax>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclClFunction(std::move(fn)); + +@@ -2019,12 +1831,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE, ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, + 0); + + auto acl_fn = asAclClFunction(std::move(fn)); +@@ -2037,12 +1849,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE, ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, + 0); + + auto acl_fn = 
asAclClFunction(std::move(fn)); +diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h +new file mode 100644 +index 0000000..6253434 +--- /dev/null ++++ b/runtime/onert/backend/acl_common/AclKernelGen.h +@@ -0,0 +1,269 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ ++#define __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ ++ ++#include ++#include ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace acl_common ++{ ++ ++template ++std::unique_ptr ++kernelGenLSTM(const ir::operation::LSTM &node, const ir::Operands &operands, ++ const std::shared_ptr &tensor_builder) ++{ ++ // TODO Support dynamic rnn ++ // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. ++ const auto scratch_buffer_index{ ++ node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; ++ const auto output_state_out_index{ ++ node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; ++ const auto cell_state_out_index{ ++ node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; ++ const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; ++ ++ const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; ++ const auto input_to_input_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional ++ const auto input_to_forget_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; ++ const auto input_to_cell_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; ++ const auto input_to_output_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; ++ const auto recurrent_to_input_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional ++ const auto recurrent_to_forget_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; ++ const auto recurrent_to_cell_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; ++ const auto recurrent_to_output_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; ++ const auto cell_to_input_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional ++ const auto cell_to_forget_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional ++ const auto cell_to_output_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional ++ const auto input_gate_bias_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; ++ const auto forget_gate_bias_index{ ++ 
node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; ++ const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; ++ const auto output_gate_bias_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; ++ const auto projection_weights_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional ++ const auto projection_bias_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional ++ const auto output_state_in_index{ ++ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; ++ const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; ++ const auto cell_threshold = node.param().cell_threshold; ++ const auto projection_threshold = node.param().projection_threshold; ++ ++ bool has_input_to_input_weights = operands.at(input_to_input_weights_index).shape().dim(0) != 0 && ++ operands.at(input_to_input_weights_index).shape().dim(1) != 0; ++ bool has_recurrent_to_input_weights = ++ operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && ++ operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0; ++ bool has_cell_to_forget_weights = operands.at(cell_to_forget_weights_index).shape().dim(0) != 0; ++ bool has_cell_to_output_weights = operands.at(cell_to_output_weights_index).shape().dim(0) != 0; ++ bool has_projection_weights = operands.at(projection_weights_index).shape().dim(0) != 0 && ++ operands.at(projection_weights_index).shape().dim(1) != 0; ++ bool has_projection_bias = operands.at(projection_bias_index).shape().dim(0); ++ ++ // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. ++ // true: no CIFG ++ // false: CIFG ++ // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). ++ bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; ++ ++ // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. ++ // But the cell_to_input_weights does not exist in regular CIFG although peephole. ++ // true: peephole ++ // false: no peephole ++ bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; ++ ++ // NOTE Although the projection weights has data the projection bias may not have data. 
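The NOTE comments around this point spell out how the optional LSTM operands select between the CIFG, peephole and projection variants, and the has_* flags implement that selection. Below is a compilable restatement of the rule as a sketch only; the struct and field names are illustrative stand-ins for the onert operand checks, not real identifiers.

#include <iostream>

// Illustrative presence flags for the optional LSTM operands; these names do not
// exist in onert, they only mirror the shape checks in the surrounding code.
struct OptionalLstmOperands
{
  bool input_to_input_weights = true;      // zero-sized in a CIFG model
  bool recurrent_to_input_weights = true;  // zero-sized in a CIFG model
  bool cell_to_forget_weights = false;     // present only with peephole connections
  bool cell_to_output_weights = false;     // present only with peephole connections
  bool projection_weights = false;         // present only with a projection layer
};

int main()
{
  OptionalLstmOperands op;
  const bool has_cifg_param = op.input_to_input_weights && op.recurrent_to_input_weights;
  const bool has_peephole_param = op.cell_to_forget_weights && op.cell_to_output_weights;
  const bool has_projection_param = op.projection_weights;  // the projection bias stays optional

  // Note the inverted naming: has_cifg_param == true means the *non*-CIFG path,
  // i.e. every gate has its own input and recurrent weights.
  std::cout << std::boolalpha << "non-CIFG: " << has_cifg_param
            << ", peephole: " << has_peephole_param
            << ", projection: " << has_projection_param << '\n';
  return 0;
}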
++ bool has_projection_param = has_projection_weights; ++ ++ const auto activation = node.param().activation; ++ const auto cell_clip = cell_threshold; ++ const auto projection_clip = projection_threshold; ++ assert(cell_clip >= 0.f && projection_clip >= 0.f); ++ ++ auto scratch_buffer_tensor = tensor_builder->at(scratch_buffer_index).get(); ++ auto output_state_out_tensor = tensor_builder->at(output_state_out_index).get(); ++ auto cell_state_out_tensor = tensor_builder->at(cell_state_out_index).get(); ++ auto output_tensor = tensor_builder->at(output_index).get(); ++ ++ auto input_tensor = tensor_builder->at(input_index).get(); ++ ++ auto input_to_forget_weights_tensor = tensor_builder->at(input_to_forget_weights_index).get(); ++ auto input_to_cell_weights_tensor = tensor_builder->at(input_to_cell_weights_index).get(); ++ auto input_to_output_weights_tensor = tensor_builder->at(input_to_output_weights_index).get(); ++ auto recurrent_to_forget_weights_tensor = ++ tensor_builder->at(recurrent_to_forget_weights_index).get(); ++ auto recurrent_to_cell_weights_tensor = tensor_builder->at(recurrent_to_cell_weights_index).get(); ++ auto recurrent_to_output_weights_tensor = ++ tensor_builder->at(recurrent_to_output_weights_index).get(); ++ ++ auto forget_gate_bias_tensor = tensor_builder->at(forget_gate_bias_index).get(); ++ auto cell_bias_tensor = tensor_builder->at(cell_bias_index).get(); ++ auto output_gate_bias_tensor = tensor_builder->at(output_gate_bias_index).get(); ++ auto output_state_in_tensor = tensor_builder->at(output_state_in_index).get(); ++ auto cell_state_in_tensor = tensor_builder->at(cell_state_in_index).get(); ++ ++ auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); ++ ++ auto fn = std::make_unique(); ++ ++ ::arm_compute::LSTMParams lstm_params{}; ++ if (has_cifg_param) ++ { ++ auto input_to_input_weights_tensor = ++ tensor_builder->at(input_to_input_weights_index).get(); // optional ++ auto recurrent_to_input_weights_tensor = ++ tensor_builder->at(recurrent_to_input_weights_index).get(); // optional ++ auto cell_to_input_weights_handle = ++ has_peephole_param ? tensor_builder->at(cell_to_input_weights_index).get()->handle() ++ : nullptr; // optional (non-cifg && peephole) ++ auto input_gate_bias_tensor = tensor_builder->at(input_gate_bias_index).get(); // optional ++ lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(), ++ recurrent_to_input_weights_tensor->handle(), ++ cell_to_input_weights_handle, input_gate_bias_tensor->handle()); ++ } ++ if (has_peephole_param) ++ { ++ auto cell_to_forget_weights_tensor = ++ tensor_builder->at(cell_to_forget_weights_index).get(); // optional ++ auto cell_to_output_weights_tensor = ++ tensor_builder->at(cell_to_output_weights_index).get(); // optional ++ lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(), ++ cell_to_output_weights_tensor->handle()); ++ } ++ if (has_projection_param) ++ { ++ auto projection_weights_tensor = tensor_builder->at(projection_weights_index).get(); // optional ++ auto projection_bias_handle = has_projection_bias ++ ? 
tensor_builder->at(projection_bias_index).get()->handle() ++ : nullptr; // optional ++ lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle); ++ } ++ ++ fn->configure(input_tensor->handle(), input_to_forget_weights_tensor->handle(), ++ input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(), ++ recurrent_to_forget_weights_tensor->handle(), ++ recurrent_to_cell_weights_tensor->handle(), ++ recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(), ++ cell_bias_tensor->handle(), output_gate_bias_tensor->handle(), ++ output_state_in_tensor->handle(), cell_state_in_tensor->handle(), ++ scratch_buffer_tensor->handle(), output_state_out_tensor->handle(), ++ cell_state_out_tensor->handle(), output_tensor->handle(), lstm_params, act_info, ++ cell_clip, projection_clip); ++ ++ return std::make_unique(std::move(fn)); ++} ++ ++template ++std::unique_ptr ++kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Operands &operands, ++ const std::shared_ptr &tensor_builder, ir::Layout layout) ++{ ++ using ir::operation::FullyConnected; ++ ++ const auto output_index{node.getOutputs().at(0)}; ++ const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; ++ const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; ++ const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; ++ ++ const auto input_rank = operands.at(input_index).shape().rank(); ++ ++ const auto output_size = ++ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1); ++ UNUSED_RELEASE(output_size); ++ assert(operands.at(bias_index).shape().dim(0) == output_size); ++ assert(operands.at(weight_index).shape().dim(0) == output_size); ++ const auto batch_size = ++ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2); ++ const auto input_size = ++ operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1); ++ ++ // Check for reshaping input's shape into rank-2 ++ bool needs_reshape = false; ++ ir::Shape reshape(2); ++ if (input_rank == 3 || input_rank == 4) ++ { ++ const auto &ifm_shape = operands.at(input_index).shape(); ++ auto feature_size = 1; ++ for (int i = 0; i < ifm_shape.rank(); ++i) ++ { ++ feature_size *= ifm_shape.dim(i); ++ } ++ ++ UNUSED_RELEASE(feature_size); ++ assert(feature_size == batch_size * input_size); ++ ++ // for reshaping ++ needs_reshape = true; ++ reshape.dim(0) = batch_size; /* H */ ++ reshape.dim(1) = input_size; /* W */ ++ } ++ ++ auto output_tensor = tensor_builder->at(output_index).get(); ++ const auto input_tensor = tensor_builder->at(input_index).get(); ++ const auto weight_tensor = tensor_builder->at(weight_index).get(); ++ const auto bias_tensor = tensor_builder->at(bias_index).get(); ++ const auto frontend_layout = layout; ++ const auto acl_layout = output_tensor->handle()->info()->data_layout(); ++ ++ auto fn = ++ std::make_unique(tensor_builder->acl_tensor_manager()->internal_buffer_manager()); ++ ++ typename T_ACLLayer::KernelType kernel_type = T_ACLLayer::KernelType::GENERAL; ++ if (operands.at(weight_index).isConstant()) ++ { ++ kernel_type = T_ACLLayer::KernelType::PREPROCESSED_WEIGHTS; ++ assert(operands.at(weight_index).data()); ++ } ++ ++ fn->configure( ++ input_tensor->handle(), weight_tensor->handle(), bias_tensor->handle(), ++ output_tensor->handle(), needs_reshape, ++ ::onert::backend::acl_common::asTensorShape( ++ reshape, frontend_layout, 
::onert::backend::acl_common::asRuntimeLayout(acl_layout)), ++ kernel_type); ++ ++ return std::make_unique(std::move(fn)); ++} ++ ++} // namespace acl_common ++} // namespace backend ++} // namespace onert ++ ++#endif // __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ +diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc +index e471867..37ec993 100644 +--- a/runtime/onert/backend/acl_neon/KernelGenerator.cc ++++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc +@@ -31,6 +31,7 @@ + #include "exec/NopFunction.h" + #include "util/logging.h" + #include "util/Utils.h" ++#include "AclKernelGen.h" + + namespace onert + { +@@ -74,15 +75,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; + + auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -96,10 +97,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) + + const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto frontend_layout = _current_op_seq_layout; +- auto backend_layout = ifm_alloc->layout(); ++ auto backend_layout = ifm_tensor->layout(); + + int axis_value = node.param().axis; + if (axis_value < 0) +@@ -112,7 +113,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) + + auto fn = std::make_unique<::arm_compute::NEArgMinMaxLayer>(); + +- fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle(), ++ fn->configure(ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), + arm_compute::ReductionOperation::ARG_IDX_MAX); + + auto acl_fn = asAclFunction(std::move(fn)); +@@ -127,15 +128,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) + const auto block_size_index{ + node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto block_size_alloc = _tensor_builder->at(block_size_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto block_size_tensor = _tensor_builder->at(block_size_index).get(); + + assert(_ctx.at(block_size_index).data()); + + auto fn = std::make_unique<::arm_compute::NEBatchToSpaceLayer>(); + +- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -147,15 +148,26 @@ void KernelGenerator::visit(const ir::operation::Cast &node) + const auto 
ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- auto fn = std::make_unique<::arm_compute::NECast>(); ++ std::unique_ptr<::arm_compute::IFunction> fn; ++ if (ifm_tensor->data_type() == ofm_tensor->data_type()) ++ { ++ auto l = std::make_unique<::arm_compute::NECopy>(); ++ ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle()); ++ ++ fn = std::move(l); ++ } ++ else ++ { ++ auto l = std::make_unique<::arm_compute::NECast>(); + +- auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 +- ? arm_compute::SubDataType::BOOL +- : arm_compute::SubDataType::NONE; +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); ++ ++ fn = std::move(l); ++ } + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -183,10 +195,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + ker_width, ker_height); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); +@@ -194,8 +206,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + auto fn = std::make_unique<::arm_compute::NEConvolutionLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), +- conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); ++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ++ ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(), ++ ::arm_compute::Size2D(1U, 1U), act_info); + + _return_fn = asAclFunction(std::move(fn)); + } +@@ -208,12 +221,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) + auto block_size = node.param().block_size; + assert(block_size > 0); + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + +- auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayerEx>(); ++ auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -242,10 +255,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) + const auto multiplier = node.param().multiplier; + const auto activation = 
node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); +@@ -253,8 +266,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) + { + auto fn = std::make_unique<::arm_compute::NEDepthwiseConvolutionLayer>(); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), +- ofm_alloc->handle(), conv_info, multiplier, act_info); ++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ++ ofm_tensor->handle(), conv_info, multiplier, act_info); + + _return_fn = asAclFunction(std::move(fn)); + } +@@ -265,12 +278,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::NEDequantizationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -305,19 +318,19 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) + VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, +- ::arm_compute::Size2D{kw, kh}, +- acl_common::asPadStrideInfo(padding, stride)}; ++ ::arm_compute::PoolingLayerInfo info{ ++ ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh}, ++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::AvgPool2D &node) +@@ -348,19 +361,20 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) + VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; + VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto 
ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, +- acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; ++ ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride), ++ true /* exclude_padding */}; + + auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Concat &node) +@@ -383,7 +397,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + return; + } + +- auto output_alloc = _tensor_builder->at(ofm_index).get(); ++ auto output_tensor = _tensor_builder->at(ofm_index).get(); + std::vector<::arm_compute::ITensor *> input_tensors; + for (const auto &ifm_ind : input_indexes) + input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); +@@ -392,7 +406,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + if (input_indexes.size() < 2) + { + auto l = std::make_unique<::arm_compute::NECopy>(); +- l->configure(input_tensors.at(0), output_alloc->handle()); ++ l->configure(input_tensors.at(0), output_tensor->handle()); + fn = std::move(l); + } + else +@@ -400,10 +414,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + auto l = std::make_unique<::arm_compute::NEConcatenateLayer>(); + const auto rank = _ctx.at(ofm_index).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = output_alloc->layout(); ++ const auto backend_layout = output_tensor->layout(); + const auto fixed_axis = + acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); +- l->configure(input_tensors, output_alloc->handle(), fixed_axis); ++ l->configure(input_tensors, output_tensor->handle(), fixed_axis); + fn = std::move(l); + } + +@@ -418,13 +432,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) + const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto lookups_alloc = _tensor_builder->at(lookups_index).get(); +- auto values_alloc = _tensor_builder->at(values_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto lookups_tensor = _tensor_builder->at(lookups_index).get(); ++ auto values_tensor = _tensor_builder->at(values_index).get(); + + auto fn = std::make_unique<::arm_compute::NEEmbeddingLookup>(); + +- fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); ++ fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -436,12 +450,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto 
ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::NEFloor>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -450,76 +464,15 @@ void KernelGenerator::visit(const ir::operation::Floor &node) + + void KernelGenerator::visit(const ir::operation::FullyConnected &node) + { +- using ir::operation::FullyConnected; +- + const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; +- const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; +- const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; +- +- const auto input_rank = _ctx.at(input_index).shape().rank(); +- +- const auto output_size = +- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); +- UNUSED_RELEASE(output_size); +- assert(_ctx.at(bias_index).shape().dim(0) == output_size); +- assert(_ctx.at(weight_index).shape().dim(0) == output_size); +- const auto batch_size = +- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); +- const auto input_size = +- _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); +- +- // Check for reshaping input's shape into rank-2 +- bool needs_reshape = false; +- ir::Shape reshape(2); +- if (input_rank == 3 || input_rank == 4) +- { +- const auto &ifm_shape = _ctx.at(input_index).shape(); +- auto feature_size = 1; +- for (int i = 0; i < ifm_shape.rank(); ++i) +- { +- feature_size *= ifm_shape.dim(i); +- } +- +- UNUSED_RELEASE(feature_size); +- assert(feature_size == batch_size * input_size); +- +- // for reshaping +- needs_reshape = true; +- reshape.dim(0) = batch_size; /* H */ +- reshape.dim(1) = input_size; /* W */ +- } +- ++ auto output_tensor = _tensor_builder->at(output_index).get(); + const auto activation = node.param().activation; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- const auto input_alloc = _tensor_builder->at(input_index).get(); +- const auto weight_alloc = _tensor_builder->at(weight_index).get(); +- const auto bias_alloc = _tensor_builder->at(bias_index).get(); +- const auto frontend_layout = _current_op_seq_layout; +- const auto acl_layout = output_alloc->handle()->info()->data_layout(); +- +- auto fn = std::make_unique( +- _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); +- +- arm_compute::NEFullyConnectedReshapingLayer::KernelType kernel_type = +- arm_compute::NEFullyConnectedReshapingLayer::KernelType::GENERAL; +- if (_ctx.at(weight_index).isConstant()) +- { +- kernel_type = arm_compute::NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS; +- assert(_ctx.at(weight_index).data()); +- } +- +- fn->configure( +- input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), +- needs_reshape, +- ::onert::backend::acl_common::asTensorShape( +- reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), +- kernel_type); +- ++ auto fn = acl_common::kernelGenFullyConnected( ++ node, _ctx, _tensor_builder, _current_op_seq_layout); + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), +- ActivationBuilder::generate(activation, output_alloc->handle())); ++ std::move(fn), ActivationBuilder::generate(activation, 
output_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::HashtableLookup &node) +@@ -531,17 +484,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) + const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; + const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto hits_alloc = _tensor_builder->at(hits_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto hits_tensor = _tensor_builder->at(hits_index).get(); + +- auto lookups_alloc = _tensor_builder->at(lookups_index).get(); +- auto keys_alloc = _tensor_builder->at(keys_index).get(); +- auto values_alloc = _tensor_builder->at(values_index).get(); ++ auto lookups_tensor = _tensor_builder->at(lookups_index).get(); ++ auto keys_tensor = _tensor_builder->at(keys_index).get(); ++ auto values_tensor = _tensor_builder->at(values_index).get(); + + auto fn = std::make_unique<::arm_compute::NEHashtableLookup>(); + +- fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), +- output_alloc->handle(), hits_alloc->handle()); ++ fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), ++ output_tensor->handle(), hits_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -561,10 +514,10 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + // Converting in reverse order + const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto indices_alloc = _tensor_builder->at(indices_index).get(); +- const auto backend_layout = ofm_alloc->layout(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto indices_tensor = _tensor_builder->at(indices_index).get(); ++ const auto backend_layout = ofm_tensor->layout(); + UNUSED_RELEASE(backend_layout); + + // NOTE The frontend layout and backend layout must be the same for this operation. +@@ -575,35 +528,35 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + // a model. For example, if a model in NHWC has this operation as output rank == 4, indices + // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W + // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
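The Gather code above converts the frontend axis through acl_common::ToARMComputeAxis, which, as its "Converting in reverse order" comment says, maps the axis onto ARM Compute's reversed dimension numbering. The stand-alone sketch below illustrates only that reversal, leaving out the NHWC/NCHW permutation the real helper also handles; to_acl_axis is an illustrative name, not an onert function.

#include <cassert>

// Map a frontend axis of a rank-`rank` tensor onto ARM Compute's reversed dimension
// numbering; negative axes are normalized first, just as the visitors above do.
inline int to_acl_axis(int rank, int axis)
{
  if (axis < 0)
    axis += rank;
  assert(axis >= 0 && axis < rank);
  return rank - axis - 1;
}

int main()
{
  assert(to_acl_axis(4, 3) == 0);   // channels of a rank-4 NHWC tensor -> ACL axis 0
  assert(to_acl_axis(4, -1) == 0);  // same axis given in negative form
  assert(to_acl_axis(2, 0) == 1);
  return 0;
}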
+- assert(backend_layout == ifm_alloc->layout()); +- assert(backend_layout == indices_alloc->layout()); ++ assert(backend_layout == ifm_tensor->layout()); ++ assert(backend_layout == indices_tensor->layout()); + assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); + + auto fn = std::make_unique<::arm_compute::NEGatherEx>(); + + // input is n-D, indices k-D, output is (n + k - 1)-D + size_t n = ifm_rank; +- assert(n == ifm_alloc->num_dimensions()); ++ assert(n == ifm_tensor->num_dimensions()); + size_t k = _ctx.at(indices_index).shape().rank(); +- assert(k == indices_alloc->num_dimensions()); ++ assert(k == indices_tensor->num_dimensions()); + + // Disable applied dim_correction +- if (n != ifm_alloc->info()->num_dimensions()) ++ if (n != ifm_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction + const auto ifm = _ctx.at(ifm_index); +- ifm_alloc->info()->set_tensor_shape( ++ ifm_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); + } +- if (k != indices_alloc->info()->num_dimensions()) ++ if (k != indices_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and indices tensor is applied dim_correction + const auto indices = _ctx.at(indices_index); +- indices_alloc->info()->set_tensor_shape( ++ indices_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); + } + +- fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); ++ fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); + + // acl_neon doesn't not revert disabling applied dim_correction because acl_neon's kernels would + // use arm_compute::TensorInfo::offset_element_in_bytes() +@@ -621,20 +574,20 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) + const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; + const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto gamma_alloc = _tensor_builder->at(gamma_index).get(); +- auto beta_alloc = _tensor_builder->at(beta_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto gamma_tensor = _tensor_builder->at(gamma_index).get(); ++ auto beta_tensor = _tensor_builder->at(beta_index).get(); + auto epsilon = node.param().epsilon; + auto activation = node.param().activation; + + auto fn = std::make_unique<::arm_compute::NEInstanceNormalizationLayerEx>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), +- beta_alloc->handle(), epsilon); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), ++ beta_tensor->handle(), epsilon); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::L2Normalization &node) +@@ -656,15 +609,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) + float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) + float bias = 0.0f; // Don't 
offset the reduction. + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, + radius, alpha, beta, bias, false); + + auto fn = std::make_unique<::arm_compute::NENormalizationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -686,19 +639,20 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node) + ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, ++ ifm_tensor->info()->data_layout(), + ::onert::backend::acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node) +@@ -712,15 +666,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod + auto beta = node.param().beta; + auto bias = node.param().bias; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo( + ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); + + auto fn = std::make_unique<::arm_compute::NENormalizationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -733,13 +687,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) + const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::NELogicalAnd>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), 
output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -751,12 +705,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::NEBitwiseNot>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -769,13 +723,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) + const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::NELogicalOr>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -787,8 +741,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; +@@ -798,7 +752,7 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) + // instead of 'INF', and then the result of this op will be errors due to the 'NaN'. + auto fn = std::make_unique<::arm_compute::NEActivationLayerEx>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -807,159 +761,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) + + void KernelGenerator::visit(const ir::operation::LSTM &node) + { +- // TODO Support dynamic rnn +- // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. 
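The LSTM visitor body that starts being removed here is replaced, at the end of this hunk, by a single call into the shared acl_common::kernelGenLSTM template that this patch adds in AclKernelGen.h. The sketch below is a minimal, self-contained illustration of that shared-template-plus-thin-visitor pattern; every type in it is a placeholder rather than a real onert or ARM Compute class.

#include <memory>
#include <utility>

struct IFunction  // stands in for onert::exec::IFunction
{
  virtual ~IFunction() = default;
};

// Backend-neutral helper in the spirit of acl_common::kernelGenLSTM: build the
// backend-specific ACL layer, configure it from the node, and wrap it.
template <typename T_FunctionWrapper, typename T_Layer, typename T_Node>
std::unique_ptr<IFunction> kernelGenSketch(const T_Node &node)
{
  auto layer = std::make_unique<T_Layer>();
  layer->configure(node);  // placeholder for the long configure(...) call seen above
  return std::make_unique<T_FunctionWrapper>(std::move(layer));
}

// Minimal stand-ins for one backend's types.
struct FakeLstmNode {};
struct FakeNeonLstmLayer
{
  void configure(const FakeLstmNode &) {}
};
struct FakeAclFunction : IFunction
{
  explicit FakeAclFunction(std::unique_ptr<FakeNeonLstmLayer>) {}
};

int main()
{
  FakeLstmNode node;
  // With the shared helper in place, the visitor body reduces to a one-line delegation.
  std::unique_ptr<IFunction> return_fn = kernelGenSketch<FakeAclFunction, FakeNeonLstmLayer>(node);
  return return_fn ? 0 : 1;
}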
+- const auto scratch_buffer_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; +- const auto output_state_out_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; +- const auto cell_state_out_index{ +- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; +- const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; +- +- const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; +- const auto input_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional +- const auto input_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; +- const auto input_to_cell_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; +- const auto input_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; +- const auto recurrent_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional +- const auto recurrent_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; +- const auto recurrent_to_cell_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; +- const auto recurrent_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; +- const auto cell_to_input_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional +- const auto cell_to_forget_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional +- const auto cell_to_output_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional +- const auto input_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; +- const auto forget_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; +- const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; +- const auto output_gate_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; +- const auto projection_weights_index{ +- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional +- const auto projection_bias_index{ +- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional +- const auto output_state_in_index{ +- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; +- const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; +- const auto cell_threshold = node.param().cell_threshold; +- const auto projection_threshold = node.param().projection_threshold; +- +- bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && +- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; +- bool has_recurrent_to_input_weights = +- _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && +- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; +- bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; +- bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; +- bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && +- 
_ctx.at(projection_weights_index).shape().dim(1) != 0; +- bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); +- +- // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. +- // true: no CIFG +- // false: CIFG +- // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). +- bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; +- +- // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. +- // But the cell_to_input_weights does not exist in regular CIFG although peephole. +- // true: peephole +- // false: no peephole +- bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; +- +- // NOTE Although the projection weights has data the projection bias may not have data. +- bool has_projection_param = has_projection_weights; +- +- const auto activation = node.param().activation; +- const auto cell_clip = cell_threshold; +- const auto projection_clip = projection_threshold; +- assert(cell_clip >= 0.f && projection_clip >= 0.f); +- +- auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); +- auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); +- auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); +- auto output_alloc = _tensor_builder->at(output_index).get(); +- +- auto input_alloc = _tensor_builder->at(input_index).get(); +- +- auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); +- auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); +- auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); +- auto recurrent_to_forget_weights_alloc = +- _tensor_builder->at(recurrent_to_forget_weights_index).get(); +- auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); +- auto recurrent_to_output_weights_alloc = +- _tensor_builder->at(recurrent_to_output_weights_index).get(); +- +- auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); +- auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); +- auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); +- auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); +- auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); +- +- auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); +- +- auto fn = std::make_unique<::arm_compute::NELSTMLayer>(); +- +- ::arm_compute::LSTMParams<::arm_compute::ITensor> lstm_params{}; +- if (has_cifg_param) +- { +- auto input_to_input_weights_alloc = +- _tensor_builder->at(input_to_input_weights_index).get(); // optional +- auto recurrent_to_input_weights_alloc = +- _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional +- auto cell_to_input_weights_handle = +- has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() +- : nullptr; // optional (non-cifg && peephole) +- auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional +- lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), +- recurrent_to_input_weights_alloc->handle(), +- cell_to_input_weights_handle, input_gate_bias_alloc->handle()); +- } +- if (has_peephole_param) +- { +- auto cell_to_forget_weights_alloc = +- _tensor_builder->at(cell_to_forget_weights_index).get(); // optional +- auto cell_to_output_weights_alloc = +- _tensor_builder->at(cell_to_output_weights_index).get(); // optional +- lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), +- cell_to_output_weights_alloc->handle()); +- } +- if (has_projection_param) +- { +- auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional +- auto projection_bias_handle = has_projection_bias +- ? _tensor_builder->at(projection_bias_index).get()->handle() +- : nullptr; // optional +- lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); +- } +- +- fn->configure( +- input_alloc->handle(), input_to_forget_weights_alloc->handle(), +- input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), +- recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), +- recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), +- cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), +- cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), +- output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), +- lstm_params, act_info, cell_clip, projection_clip); +- +- auto acl_fn = asAclFunction(std::move(fn)); +- +- _return_fn = std::move(acl_fn); ++ _return_fn = acl_common::kernelGenLSTM(node, _ctx, _tensor_builder); + } + + void KernelGenerator::visit(const ir::operation::Mul &node) +@@ -970,18 +773,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEPixelWiseMultiplication>(); + + // RoundingPolicy for scale:1.0 is only allowed RoundingPolicy::TO_ZERO +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Neg &node) +@@ -989,12 +792,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = 
_tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::NENegLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1030,12 +833,12 @@ void KernelGenerator::visit(const ir::operation::Pack &node) + for (const auto &input_index : input_indexes) + { + size_t input_rank = _ctx.at(input_index).shape().rank(); +- const auto &input_alloc = _tensor_builder->at(input_index); +- assert(input_rank == input_alloc->num_dimensions()); +- if (input_rank != input_alloc->info()->num_dimensions()) ++ const auto &input_tensor = _tensor_builder->at(input_index); ++ assert(input_rank == input_tensor->num_dimensions()); ++ if (input_rank != input_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction +- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( ++ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( + _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); + } + } +@@ -1094,8 +897,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(0)}; + const auto permute_type = node.getPermuteType(); +- auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); + const auto rank = _ctx.at(ofm_idx).shape().rank(); + assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); + +@@ -1108,7 +911,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + + auto l = std::make_unique<::arm_compute::NEPermute>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); + + fn = std::move(l); + } +@@ -1119,7 +922,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + + auto l = std::make_unique<::arm_compute::NEPermute>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); + + fn = std::move(l); + } +@@ -1127,7 +930,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) + { + auto l = std::make_unique<::arm_compute::NECopy>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + } +@@ -1143,15 +946,15 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) + const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto alpha_alloc = _tensor_builder->at(alpha_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto alpha_tensor = _tensor_builder->at(alpha_index).get(); + + std::unique_ptr<::arm_compute::IFunction> fn; + +- auto l = std::make_unique<::arm_compute::NEPReLU>(); ++ auto l = std::make_unique<::arm_compute::NEPReluLayer>(); + +- 
l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); ++ l->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + +@@ -1166,14 +969,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; + const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + // Convert to ACL axes taking into account negative values and possible duplicates. + const auto &axes = _ctx.at(axes_index); + const auto input_rank = _ctx.at(input_index).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = input_alloc->layout(); ++ const auto backend_layout = input_tensor->layout(); + const auto reduce_axes = + acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); + const auto reduce_type = node.param().reduce_type; +@@ -1182,11 +985,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + std::unique_ptr<::arm_compute::IFunction> fn; + if (reduce_type == ir::operation::Reduce::ReduceType::MEAN) + { +- // NOTE NEReduceMean has a bug that does not support NHWC layout +- // NEReduceMean intermediate tensors are always NCHW layout +- auto l = std::make_unique<::arm_compute::NEReduceMeanEx>(); ++ auto l = std::make_unique<::arm_compute::NEReduceMean>(); + +- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle()); ++ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle()); + + fn = std::move(l); + } +@@ -1194,7 +995,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + { + auto l = std::make_unique<::arm_compute::NEReduceSum>(); + +- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle()); ++ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle()); + + fn = std::move(l); + } +@@ -1202,7 +1003,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + { + auto l = std::make_unique<::arm_compute::NEReduceOperation>(); + +- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle(), ++ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(), + acl_common::convertReduceType(reduce_type)); + + fn = std::move(l); +@@ -1218,15 +1019,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1238,15 +1039,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 
&node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1258,15 +1059,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; + + auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1278,13 +1079,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + // NOTE This operation must not be changed the layout from frontend to backend + // So, PermutationOperationPass makes layouts of frontend and backend the same. 
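For reference, the ReLU1 and ReLU6 visitors above both map onto the stock arm_compute::NEActivationLayer and differ only in the ActivationLayerInfo they pass (LU_BOUNDED_RELU with bounds 1/-1 versus BOUNDED_RELU with bound 6). A minimal standalone sketch of that ACL call pattern, assuming the library's usual Tensor/allocator workflow; the shape and the main() harness are illustrative, not taken from this backend:

#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
  using namespace arm_compute;

  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));

  // ReLU6 is expressed as BOUNDED_RELU with an upper bound of 6.0f, as in the visitor.
  NEActivationLayer relu6;
  relu6.configure(&input, &output,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f));

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill `input` with data ...
  relu6.run();
  return 0;
}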
+ const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = output_alloc->layout(); ++ const auto backend_layout = output_tensor->layout(); + assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || + frontend_layout == backend_layout); + UNUSED_RELEASE(frontend_layout); +@@ -1292,7 +1093,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) + + auto fn = std::make_unique(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1305,12 +1106,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) + + const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::NEScale>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), + ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, + ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); + +@@ -1334,25 +1135,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node) + + const auto activation = node.param().activation; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get(); + +- auto input_alloc = _tensor_builder->at(input_index).get(); +- auto weights_alloc = _tensor_builder->at(weights_index).get(); +- auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); +- auto bias_alloc = _tensor_builder->at(bias_index).get(); +- auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); ++ auto weights_tensor = _tensor_builder->at(weights_index).get(); ++ auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get(); ++ auto bias_tensor = _tensor_builder->at(bias_index).get(); ++ auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get(); + auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); + + auto copy_layer = std::make_unique<::arm_compute::NECopy>(); +- copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); ++ copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle()); + _return_fn = asAclFunction(std::move(copy_layer)); + +- auto fn = std::make_unique<::arm_compute::NERNNLayerEx>( ++ auto fn = std::make_unique<::arm_compute::NERNNLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); +- fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), +- bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), +- act_info); ++ fn->configure(input_tensor->handle(), weights_tensor->handle(), ++ recurrent_weights_tensor->handle(), bias_tensor->handle(), ++ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info); + 
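For reference, the NEScale call in the ResizeBilinear hunk above uses the overload that takes the interpolation policy, border mode, fill value and sampling policy directly; newer ACL releases wrap these in a ScaleKernelInfo, so the exact overload depends on the library version being targeted. A standalone sketch mirroring the same call pattern, with illustrative shapes and assuming that overload is available:

#include "arm_compute/runtime/NEON/functions/NEScale.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
  using namespace arm_compute;

  Tensor src, dst;
  src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U), 1, DataType::F32));
  dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 1U), 1, DataType::F32));

  NEScale scale;
  // Same argument pattern as the ResizeBilinear visitor above.
  scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, BorderMode::REPLICATE,
                  PixelValue(0.f), SamplingPolicy::TOP_LEFT);

  src.allocator()->allocate();
  dst.allocator()->allocate();
  scale.run();
  return 0;
}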
_return_fn = asAclFunction(std::move(fn)); + } + +@@ -1361,12 +1162,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + + auto fn = std::make_unique<::arm_compute::NERsqrtLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + _return_fn = asAclFunction(std::move(fn)); + } +@@ -1383,10 +1184,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) + (void)dims; + (void)ndim; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + auto fn = std::make_unique(); +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + auto acl_fn = asAclFunction(std::move(fn)); + _return_fn = std::move(acl_fn); + } +@@ -1396,15 +1197,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1417,13 +1218,25 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) + const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)}; + const auto beta = node.param().beta; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); ++ const auto frontend_layout = _current_op_seq_layout; ++ const auto backend_layout = input_tensor->layout(); ++ ++ // Disable applied dim_correction ++ const size_t input_rank = _ctx.at(input_index).shape().rank(); ++ if (input_rank != input_tensor->info()->num_dimensions()) ++ { ++ // This means that high dimension's value is 1 and input tensor is applied dim_correction ++ const auto input = _ctx.at(input_index); ++ input_tensor->info()->set_tensor_shape( ++ acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false)); ++ } + + auto fn = std::make_unique<::arm_compute::NESoftmaxLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), beta); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), beta); + + auto acl_fn = 
asAclFunction(std::move(fn)); + +@@ -1438,20 +1251,18 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) + node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; + const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto block_size_alloc = _tensor_builder->at(block_size_index).get(); +- auto paddings_alloc = _tensor_builder->at(paddings_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto block_size_tensor = _tensor_builder->at(block_size_index).get(); ++ auto paddings_tensor = _tensor_builder->at(paddings_index).get(); + + assert(_ctx.at(block_size_index).data()); + assert(_ctx.at(paddings_index).data()); + +- // NESpaceToBatchLayer has a bug that padding's values are 0 even when zero point of QASYMM8 is +- // not 0. +- auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayerEx>(); ++ auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayer>(); + +- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), +- ofm_alloc->handle()); ++ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(), ++ ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1465,12 +1276,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) + + auto block_size = node.param().block_size; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + +- auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayerEx>(); ++ auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayer>(); + +- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); ++ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1489,13 +1300,13 @@ void KernelGenerator::visit(const ir::operation::Split &node) + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- std::vector output_allocs; ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ std::vector output_tensors; + for (const auto &ofm_ind : output_indexes) +- output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); ++ output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = ifm_alloc->layout(); ++ const auto backend_layout = ifm_tensor->layout(); + auto axis = node.param().axis; + if (axis < 0) + axis += ifm_rank; +@@ -1503,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) + + auto fn = std::make_unique<::arm_compute::NESplit>(); + +- fn->configure(ifm_alloc->handle(), output_allocs, axis); ++ fn->configure(ifm_tensor->handle(), output_tensors, axis); + + _return_fn = asAclFunction(std::move(fn)); + } +@@ -1513,15 +1324,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); 
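For reference, the SpaceToDepth hunk above replaces the project-local NESpaceToDepthLayerEx with the stock arm_compute::NESpaceToDepthLayer. A minimal standalone sketch of that function with an arbitrary 4x4 single-channel input and block size 2, assuming the usual ACL Tensor/allocator workflow:

#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
  using namespace arm_compute;

  Tensor src, dst;
  // TensorShape is (width, height, channels): a 4x4x1 input becomes 2x2x4 for block size 2.
  src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U), 1, DataType::F32));
  dst.allocator()->init(TensorInfo(TensorShape(2U, 2U, 4U), 1, DataType::F32));

  NESpaceToDepthLayer space_to_depth;
  space_to_depth.configure(&src, &dst, 2 /* block size, cf. node.param().block_size */);

  src.allocator()->allocate();
  dst.allocator()->allocate();
  space_to_depth.run();
  return 0;
}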
+- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; + + auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); ++ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1534,13 +1345,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) + const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEElementwiseSquaredDiff>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1555,17 +1366,17 @@ void KernelGenerator::visit(const ir::operation::Sub &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEArithmeticSubtraction>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), + arm_compute::ConvertPolicy::SATURATE); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Slice &node) +@@ -1575,10 +1386,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) + const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; + const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; + +- auto outputData_alloc = _tensor_builder->at(output_index).get(); +- auto inputData_alloc = _tensor_builder->at(input_index).get(); ++ auto outputData_tensor = _tensor_builder->at(output_index).get(); ++ auto inputData_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = inputData_alloc->layout(); ++ const auto backend_layout = inputData_tensor->layout(); + + // Set initializers for indices data such as order of inputData + int input_rank = _ctx.at(input_index).shape().rank(); +@@ -1628,7 +1439,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) + + auto fn = std::make_unique<::arm_compute::NESlice>(); + +- 
fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); ++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1643,10 +1454,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; + const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; + +- auto outputData_alloc = _tensor_builder->at(output_index).get(); +- auto inputData_alloc = _tensor_builder->at(input_index).get(); ++ auto outputData_tensor = _tensor_builder->at(output_index).get(); ++ auto inputData_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = inputData_alloc->layout(); ++ const auto backend_layout = inputData_tensor->layout(); + + // Set initializers for indices data such as order of inputData + int input_rank = _ctx.at(input_index).shape().rank(); +@@ -1715,7 +1526,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + + auto fn = std::make_unique<::arm_compute::NEStridedSlice>(); + +- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, ++ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, + strides_set, begin_mask, end_mask, shrink_axis_mask); + + auto acl_fn = asAclFunction(std::move(fn)); +@@ -1749,16 +1560,16 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) + invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); + } + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->at(ifm_index).get(); +- auto ker_alloc = _tensor_builder->at(ker_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->at(ker_index).get(); + + const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); + + auto fn = std::make_unique<::arm_compute::NETransposeConvLayer>(); + +- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, +- invalid_horizontal, invalid_vertical); ++ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), ++ tconv_info, invalid_horizontal, invalid_vertical); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1771,10 +1582,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; + const auto &perm{node.param().perm}; + +- auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); +- const auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); ++ const auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); + const auto frontend_layout = _current_op_seq_layout; +- const auto backend_layout = ifm_alloc->layout(); ++ const auto backend_layout = ifm_tensor->layout(); + + const auto rank = _ctx.at(ifm_idx).shape().rank(); + std::vector pv(perm.cbegin(), perm.cend()); +@@ -1783,11 +1594,11 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + + std::unique_ptr<::arm_compute::IFunction> fn; + +- if (ifm_alloc->num_dimensions() <= 2 && ofm_alloc->num_dimensions() <= 2) ++ if (ifm_tensor->num_dimensions() <= 2 && 
ofm_tensor->num_dimensions() <= 2) + { + auto l = std::make_unique<::arm_compute::NETranspose>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle()); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + } +@@ -1795,7 +1606,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + { + auto l = std::make_unique<::arm_compute::NEPermute>(); + +- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); ++ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); + + fn = std::move(l); + } +@@ -1834,13 +1645,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) + for (const auto &output_index : output_indexes) + { + size_t output_rank = _ctx.at(output_index).shape().rank(); +- const auto &output_alloc = _tensor_builder->at(output_index); +- orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape()); +- assert(output_rank == output_alloc->num_dimensions()); +- if (output_rank != output_alloc->info()->num_dimensions()) ++ const auto &output_tensor = _tensor_builder->at(output_index); ++ orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); ++ assert(output_rank == output_tensor->num_dimensions()); ++ if (output_rank != output_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction +- output_alloc->info()->set_tensor_shape(acl_common::asTensorShape( ++ output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( + _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); + } + } +@@ -1858,17 +1669,17 @@ void KernelGenerator::visit(const ir::operation::Add &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEArithmeticAddition>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), + arm_compute::ConvertPolicy::SATURATE); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Div &node) +@@ -1879,16 +1690,16 @@ void KernelGenerator::visit(const ir::operation::Div &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEElementwiseDivision>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + _return_fn = std::make_unique( +- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, 
ofm_alloc->handle())); ++ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); + } + + void KernelGenerator::visit(const ir::operation::Exp &node) +@@ -1896,12 +1707,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::NEExpLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1913,12 +1724,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input_tensor = _tensor_builder->at(input_index).get(); + + auto fn = std::make_unique<::arm_compute::NEReshapeLayer>(); + +- fn->configure(input_alloc->handle(), output_alloc->handle()); ++ fn->configure(input_tensor->handle(), output_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1933,13 +1744,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) + + const auto comparison_type = node.param().comparison_type; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input0_alloc = _tensor_builder->at(input0_index).get(); +- auto input1_alloc = _tensor_builder->at(input1_index).get(); ++ auto output_tensor = _tensor_builder->at(output_index).get(); ++ auto input0_tensor = _tensor_builder->at(input0_index).get(); ++ auto input1_tensor = _tensor_builder->at(input1_index).get(); + + auto fn = std::make_unique<::arm_compute::NEElementwiseComparison>(); + +- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), ++ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), + (arm_compute::ComparisonOperation)comparison_type); + + auto acl_fn = asAclFunction(std::move(fn)); +@@ -1953,13 +1764,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) + const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEElementwiseMin>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +@@ -1972,13 +1783,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) + const auto 
lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->at(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->at(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->at(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->at(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->at(rhs_index).get(); + + auto fn = std::make_unique<::arm_compute::NEElementwiseMax>(); + +- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); ++ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + +diff --git a/runtime/onert/backend/cpu/ConstantInitializer.cc b/runtime/onert/backend/cpu/ConstantInitializer.cc +index 71e3136..deb27f0 100644 +--- a/runtime/onert/backend/cpu/ConstantInitializer.cc ++++ b/runtime/onert/backend/cpu/ConstantInitializer.cc +@@ -15,6 +15,7 @@ + */ + + #include "ConstantInitializer.h" ++#include "Tensor.h" + + namespace onert + { +@@ -30,39 +31,61 @@ ConstantInitializer::ConstantInitializer(const ir::Operands &operands, + // DO NOTHING + } + ++void ConstantInitializer::registerDefaultInitializer(const ir::OperandIndex &index, ++ const ir::Operand &obj) ++{ ++ registerExternalInitializer(index, obj); ++} ++ ++void ConstantInitializer::registerExternalInitializer(const ir::OperandIndex &index, ++ const ir::Operand &obj) ++{ ++ // For only CONSTANTS ++ // TODO Add to check if tensor has been allocated ++ if (!obj.isConstant()) ++ return; ++ ++ _init_map[index] = [](const onert::ir::Operand &model_obj, onert::backend::ITensor &itensor) { ++ auto data = model_obj.shareData(); ++ assert(data && data->base()); ++ ExternalTensor &tensor = dynamic_cast(itensor); ++ tensor.setData(data); ++ }; ++} ++ + void ConstantInitializer::visit(const ir::operation::Conv2D &node) + { + const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); +- registerCopyInitializer(kernel_index, kernel_obj); ++ registerExternalInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); + const auto &bias_obj = _operands.at(bias_index); +- registerCopyInitializer(bias_index, bias_obj); ++ registerExternalInitializer(bias_index, bias_obj); + } + + void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node) + { + const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); +- registerCopyInitializer(kernel_index, kernel_obj); ++ registerExternalInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); + const auto &bias_obj = _operands.at(bias_index); +- registerCopyInitializer(bias_index, bias_obj); ++ registerExternalInitializer(bias_index, bias_obj); + } + + void ConstantInitializer::visit(const ir::operation::FullyConnected &node) + { + const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); + const auto &weight_obj = _operands.at(weight_index); +- registerCopyInitializer(weight_index, weight_obj); ++ registerExternalInitializer(weight_index, weight_obj); + + const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); + if (!bias_index.undefined()) + { + const auto &bias_obj = 
_operands.at(bias_index); +- registerCopyInitializer(bias_index, bias_obj); ++ registerExternalInitializer(bias_index, bias_obj); + } + } + +diff --git a/runtime/onert/backend/cpu/ConstantInitializer.h b/runtime/onert/backend/cpu/ConstantInitializer.h +index bd06c64..de03a69 100644 +--- a/runtime/onert/backend/cpu/ConstantInitializer.h ++++ b/runtime/onert/backend/cpu/ConstantInitializer.h +@@ -36,6 +36,15 @@ public: + const std::shared_ptr &tensor_builder); + + public: ++ void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override; ++ ++ // TODO: For now the only cpu backend supports constant tensor to use data from external ++ // If the other backend supports (to do this, ++ // ExternalTensor should be abstract such as IExternal, maybe), ++ // this can be an interface of IConstantInitializer ++ void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &); ++ ++public: + void visit(const ir::operation::Conv2D &) override; + void visit(const ir::operation::DepthwiseConv2D &) override; + void visit(const ir::operation::FullyConnected &) override; +diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc +index 72f9606..2766aa2 100644 +--- a/runtime/onert/backend/cpu/KernelGenerator.cc ++++ b/runtime/onert/backend/cpu/KernelGenerator.cc +@@ -60,6 +60,7 @@ + #include "ops/SoftMaxLayer.h" + #include "ops/StridedSliceLayer.h" + #include "ops/SpaceToBatchNDLayer.h" ++#include "ops/SpaceToDepthLayer.h" + #include "ops/SplitLayer.h" + #include "ops/SubLayer.h" + #include "ops/TanhLayer.h" +@@ -70,11 +71,13 @@ + #include "ops/ZerosLikeLayer.h" + #include "ops/SquaredDiffLayer.h" + #include "ops/LogicalOrLayer.h" ++#include "ops/L2NormLayer.h" + #include "ops/MatrixBandPartLayer.h" + #include "ops/BatchMatMulLayer.h" + #include "ops/BroadcastToLayer.h" + #include "ops/FusedBatchNormLayer.h" + #include "ops/LogSoftMaxLayer.h" ++#include "ops/QuantizeLayer.h" + + #include + #include +@@ -184,10 +187,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); +- auto ker_alloc = _tensor_builder->portableAt(ker_index).get(); +- auto bias_alloc = _tensor_builder->portableAt(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->portableAt(ker_index).get(); ++ auto bias_tensor = _tensor_builder->portableAt(bias_index).get(); + + const auto stride = node.param().stride; + const auto activation = node.param().activation; +@@ -196,9 +199,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) + + if (_ctx.at(ifm_index).info().isDynamic() || _ctx.at(ker_index).info().isDynamic()) + { +- fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, param_padding.param.left, ++ fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, param_padding.param.left, + param_padding.param.right, param_padding.param.top, param_padding.param.bottom, +- stride.horizontal, stride.vertical, activation, ofm_alloc); ++ stride.horizontal, stride.vertical, activation, ofm_tensor); + + _return_fn = std::move(fn); + return; +@@ -213,9 +216,9 @@ void KernelGenerator::visit(const 
ir::operation::Conv2D &node) + const auto padding = + ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height); + +- fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, padding.left, padding.right, +- padding.top, padding.bottom, stride.horizontal, stride.vertical, activation, +- ofm_alloc); ++ fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left, ++ padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, ++ activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -241,16 +244,16 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) + const auto multiplier = node.param().multiplier; + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); +- auto ker_alloc = _tensor_builder->portableAt(ker_index).get(); +- auto bias_alloc = _tensor_builder->portableAt(bias_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); ++ auto ker_tensor = _tensor_builder->portableAt(ker_index).get(); ++ auto bias_tensor = _tensor_builder->portableAt(bias_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ker_alloc, bias_alloc, padding.left, padding.right, padding.top, ++ fn->configure(ifm_tensor, ker_tensor, bias_tensor, padding.left, padding.right, padding.top, + padding.bottom, stride.horizontal, stride.vertical, multiplier, activation, +- ofm_alloc); ++ ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -270,13 +273,13 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) + ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom, +- stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc); ++ fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom, ++ stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -295,13 +298,13 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) + ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom, +- stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc); ++ fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom, ++ stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -313,7 +316,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + const auto rank = 
_ctx.at(ofm_index).shape().rank(); + const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); + +- auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); + + std::vector input_tensors; + for (auto &ifm_idx : node.getInputs()) +@@ -321,7 +324,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) + + auto fn = std::make_unique(); + +- fn->configure(input_tensors, axis, output_alloc); ++ fn->configure(input_tensors, axis, output_tensor); + + _return_fn = std::move(fn); + } +@@ -332,13 +335,13 @@ void KernelGenerator::visit(const ir::operation::Fill &node) + const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)}; + const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto value_alloc = _tensor_builder->portableAt(value_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto value_tensor = _tensor_builder->portableAt(value_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, value_alloc, output_alloc); ++ fn->configure(input_tensor, value_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -353,15 +356,15 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node) + const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; + const auto activation = node.param().activation; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto weight_alloc = _tensor_builder->portableAt(weight_index).get(); +- auto bias_alloc = ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto weight_tensor = _tensor_builder->portableAt(weight_index).get(); ++ auto bias_tensor = + bias_index.undefined() ? 
nullptr : _tensor_builder->portableAt(bias_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, weight_alloc, bias_alloc, activation, output_alloc); ++ fn->configure(input_tensor, weight_tensor, bias_tensor, activation, output_tensor); + + _return_fn = std::move(fn); + } +@@ -371,21 +374,21 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + // optional 2nd input +- IPortableTensor *shape_alloc = nullptr; ++ IPortableTensor *shape_tensor = nullptr; + + if (node.getInputs().size() == 2) + { + const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)}; +- shape_alloc = _tensor_builder->portableAt(shape_index).get(); ++ shape_tensor = _tensor_builder->portableAt(shape_index).get(); + } + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, shape_alloc, output_alloc); ++ fn->configure(input_tensor, shape_tensor, output_tensor); + _return_fn = std::move(fn); + } + +@@ -394,13 +397,13 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + // Squeeze can share same kernel with reshape + auto fn = std::make_unique(); + +- fn->configure(input_alloc, nullptr, output_alloc); ++ fn->configure(input_tensor, nullptr, output_tensor); + + _return_fn = std::move(fn); + } +@@ -412,12 +415,12 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) + + const auto beta = node.param().beta; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, beta, output_alloc); ++ fn->configure(input_tensor, beta, output_tensor); + + _return_fn = std::move(fn); + } +@@ -430,13 +433,13 @@ void KernelGenerator::visit(const ir::operation::Add &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -447,15 +450,15 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) + const auto 
lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; + const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto comparison_type = node.param().comparison_type; + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, comparison_type, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, comparison_type, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -466,11 +469,11 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; + const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto indices_alloc = _tensor_builder->portableAt(indices_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto indices_tensor = _tensor_builder->portableAt(indices_index).get(); + +- const auto backend_layout = output_alloc->layout(); ++ const auto backend_layout = output_tensor->layout(); + UNUSED_RELEASE(backend_layout); + + // NOTE The frontend layout and backend layout must be the same for this operation. +@@ -481,8 +484,8 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + // a model. For example, if a model in NHWC has this operation as output rank == 4, indices + // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W + // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
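The layout restriction described in the Gather comment above is easier to see with the concrete NHWC-to-NCHW axis mapping. A small self-contained illustration follows; the permutation table is just the standard one and is not code from this backend:

#include <array>
#include <cstdio>

int main()
{
  // Position of each NHWC axis (N, H, W, C) inside an NCHW-laid-out rank-4 tensor (N, C, H, W).
  constexpr std::array<int, 4> nhwc_to_nchw = {0, 2, 3, 1};

  // W (NHWC axis 2) and C (NHWC axis 3) are neighbours in NHWC...
  std::printf("W: NHWC axis 2 -> NCHW axis %d\n", nhwc_to_nchw[2]); // prints 3
  std::printf("C: NHWC axis 3 -> NCHW axis %d\n", nhwc_to_nchw[3]); // prints 1
  // ...but land on NCHW axes 3 and 1, which are not sequential. This is why the Gather
  // visitor asserts that the frontend and backend layouts match for rank-4 inputs.
  return 0;
}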
+- assert(backend_layout == input_alloc->layout()); +- assert(backend_layout == indices_alloc->layout()); ++ assert(backend_layout == input_tensor->layout()); ++ assert(backend_layout == indices_tensor->layout()); + const auto &input_shape = _ctx.at(input_index).shape(); + UNUSED_RELEASE(input_shape); + assert(input_shape.rank() < 4 || _current_op_seq_layout == backend_layout); +@@ -492,7 +495,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node) + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, indices_alloc, output_alloc, axis_value); ++ fn->configure(input_tensor, indices_tensor, output_tensor, axis_value); + + _return_fn = std::move(fn); + } +@@ -506,13 +509,13 @@ void KernelGenerator::visit(const ir::operation::Sub &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -526,13 +529,13 @@ void KernelGenerator::visit(const ir::operation::Mul &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -547,18 +550,18 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) + + const auto axis = node.param().axis; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto indices_alloc = _tensor_builder->portableAt(indices_index).get(); +- auto depth_alloc = _tensor_builder->portableAt(depth_index).get(); +- auto onvalue_alloc = _tensor_builder->portableAt(onvalue_index).get(); +- auto offvalue_alloc = _tensor_builder->portableAt(offvalue_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto indices_tensor = _tensor_builder->portableAt(indices_index).get(); ++ auto depth_tensor = _tensor_builder->portableAt(depth_index).get(); ++ auto onvalue_tensor = _tensor_builder->portableAt(onvalue_index).get(); ++ auto offvalue_tensor = _tensor_builder->portableAt(offvalue_index).get(); + +- assert(indices_alloc->data_type() == OperandType::INT32); +- assert(axis <= static_cast(indices_alloc->num_dimensions())); ++ assert(indices_tensor->data_type() == OperandType::INT32); ++ assert(axis <= static_cast(indices_tensor->num_dimensions())); + + auto fn = std::make_unique(); + +- fn->configure(indices_alloc, depth_alloc, onvalue_alloc, offvalue_alloc, output_alloc, axis); ++ fn->configure(indices_tensor, depth_tensor, onvalue_tensor, offvalue_tensor, output_tensor, axis); + + _return_fn = std::move(fn); + } 
+@@ -572,13 +575,13 @@ void KernelGenerator::visit(const ir::operation::Div &node) + + const auto activation = node.param().activation; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -587,16 +590,16 @@ void KernelGenerator::visit(const ir::operation::Einsum &node) + { + const auto ofm_index{node.getOutputs().at(0)}; + +- auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); +- std::vector input_allocs; ++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ std::vector input_tensors; + for (auto &ifm_idx : node.getInputs()) +- input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); ++ input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); + + const auto equation = node.param().equation; + + auto fn = std::make_unique(); + +- fn->configure(input_allocs, equation, output_alloc); ++ fn->configure(input_tensors, equation, output_tensor); + + _return_fn = std::move(fn); + } +@@ -605,14 +608,14 @@ void KernelGenerator::visit(const ir::operation::Custom &node) + { + auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq, + std::vector &types, +- std::vector> &allocs) { ++ std::vector> &tensors) { + for (auto &idx : opSeq) + { + const auto &operand = _ctx.at(idx); + // TODO make sure using `_current_op_seq_layout` is correct for custom operations + types.emplace_back(custom::TypeInfo{operand.shape(), operand.typeInfo().type()}); +- auto in_alloc = _tensor_builder->portableAt(idx); +- allocs.emplace_back(in_alloc); ++ auto in_tensor = _tensor_builder->portableAt(idx); ++ tensors.emplace_back(in_tensor); + } + }; + +@@ -634,12 +637,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -650,13 +653,13 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) + const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto axis_alloc = _tensor_builder->portableAt(axis_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto axis_tensor = _tensor_builder->portableAt(axis_index).get(); + + auto fn = std::make_unique(); + +- 
fn->configure(input_alloc, axis_alloc, output_alloc); ++ fn->configure(input_tensor, axis_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -666,12 +669,12 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -681,12 +684,12 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -700,7 +703,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) + + assert(-rank <= axis && axis < rank); + +- auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); + + std::vector input_tensors; + for (auto &ifm_idx : node.getInputs()) +@@ -708,7 +711,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) + + auto fn = std::make_unique(); + +- fn->configure(input_tensors, axis, output_alloc); ++ fn->configure(input_tensors, axis, output_tensor); + + _return_fn = std::move(fn); + } +@@ -722,7 +725,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) + + assert(rank == 0 || (-rank <= axis && axis < rank)); + +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + std::vector output_tensors; + for (auto &output_idx : node.getOutputs()) +@@ -732,7 +735,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) + + uint32_t axis_resolved = (axis < 0 ? axis + rank : axis); + +- fn->configure(input_alloc, axis_resolved, node.param().num, output_tensors); ++ fn->configure(input_tensor, axis_resolved, node.param().num, output_tensors); + + _return_fn = std::move(fn); + } +@@ -751,8 +754,16 @@ void KernelGenerator::visit(const ir::operation::Pad &node) + + auto fn = std::make_unique(); + +- fn->configure(input, output, pad_base, pad_rank); ++ bool isPadV2 = node.getInputs().size() == 3 ? 
true : false; ++ const void *value = nullptr; + ++ if (isPadV2) ++ { ++ const auto value_index{node.getInputs().at(ir::operation::Pad::Input::VALUE)}; ++ value = reinterpret_cast(_ctx.at(value_index).data()->base()); ++ } ++ ++ fn->configure(input, output, pad_base, pad_rank, value); + _return_fn = std::move(fn); + } + +@@ -762,13 +773,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) + const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -779,13 +790,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) + const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -795,12 +806,12 @@ void KernelGenerator::visit(const ir::operation::Cast &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -810,12 +821,12 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc, node.param().perm); ++ fn->configure(input_tensor, output_tensor, node.param().perm); + + _return_fn = std::move(fn); + } +@@ -827,15 +838,15 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; + + const auto keep_dims = node.param().keep_dims; +- auto output_alloc = 
_tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto axes_alloc = _tensor_builder->portableAt(axes_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto axes_tensor = _tensor_builder->portableAt(axes_index).get(); + + if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN) + { + auto fn = std::make_unique(); + +- fn->configure(input_alloc, axes_alloc, output_alloc, keep_dims); ++ fn->configure(input_tensor, axes_tensor, output_tensor, keep_dims); + + _return_fn = std::move(fn); + } +@@ -844,7 +855,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) + auto fn = std::make_unique(); + + const auto reduce_type = convertReduceType(node.param().reduce_type); +- fn->configure(input_alloc, axes_alloc, output_alloc, reduce_type, keep_dims); ++ fn->configure(input_tensor, axes_tensor, output_tensor, reduce_type, keep_dims); + + _return_fn = std::move(fn); + } +@@ -855,12 +866,12 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(0)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -872,14 +883,14 @@ void KernelGenerator::visit(const ir::operation::Select &node) + const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; + const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto condition_alloc = _tensor_builder->portableAt(condition_index).get(); +- auto true_alloc = _tensor_builder->portableAt(true_index).get(); +- auto false_alloc = _tensor_builder->portableAt(false_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto condition_tensor = _tensor_builder->portableAt(condition_index).get(); ++ auto true_tensor = _tensor_builder->portableAt(true_index).get(); ++ auto false_tensor = _tensor_builder->portableAt(false_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(condition_alloc, true_alloc, false_alloc, output_alloc); ++ fn->configure(condition_tensor, true_tensor, false_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -891,14 +902,14 @@ void KernelGenerator::visit(const ir::operation::Slice &node) + const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; + const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto begins_alloc = _tensor_builder->portableAt(begins_index).get(); +- auto sizes_alloc = _tensor_builder->portableAt(sizes_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto begins_tensor = _tensor_builder->portableAt(begins_index).get(); ++ auto 
sizes_tensor = _tensor_builder->portableAt(sizes_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, begins_alloc, sizes_alloc, output_alloc); ++ fn->configure(input_tensor, begins_tensor, sizes_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -911,11 +922,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; + const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto starts_alloc = _tensor_builder->portableAt(starts_index).get(); +- auto ends_alloc = _tensor_builder->portableAt(ends_index).get(); +- auto strides_alloc = _tensor_builder->portableAt(strides_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto starts_tensor = _tensor_builder->portableAt(starts_index).get(); ++ auto ends_tensor = _tensor_builder->portableAt(ends_index).get(); ++ auto strides_tensor = _tensor_builder->portableAt(strides_index).get(); + + auto begin_mask = node.param().begin_mask; + auto end_mask = node.param().end_mask; +@@ -923,7 +934,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, starts_alloc, ends_alloc, strides_alloc, output_alloc, begin_mask, ++ fn->configure(input_tensor, starts_tensor, ends_tensor, strides_tensor, output_tensor, begin_mask, + end_mask, shrink_axis_mask); + + _return_fn = std::move(fn); +@@ -957,12 +968,12 @@ void KernelGenerator::visit(const ir::operation::Abs &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -972,12 +983,12 @@ void KernelGenerator::visit(const ir::operation::Sin &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Sin::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -987,12 +998,12 @@ void KernelGenerator::visit(const ir::operation::Cos &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Cos::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- 
fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -1002,12 +1013,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -1017,12 +1028,12 @@ void KernelGenerator::visit(const ir::operation::Shape &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -1033,13 +1044,13 @@ void KernelGenerator::visit(const ir::operation::Reverse &node) + const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto axis_alloc = _tensor_builder->portableAt(axis_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto axis_tensor = _tensor_builder->portableAt(axis_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, axis_alloc, output_alloc); ++ fn->configure(input_tensor, axis_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1049,12 +1060,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -1066,12 +1077,12 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) + + const auto axis = node.param().axis; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc, axis, /* is_arg_max */ true); ++ fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true); + + _return_fn = std::move(fn); + } +@@ -1082,13 +1093,13 @@ void 
KernelGenerator::visit(const ir::operation::Pow &node) + const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, ir::Activation::NONE, output_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, ir::Activation::NONE, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1098,12 +1109,12 @@ void KernelGenerator::visit(const ir::operation::Log &node) + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(ir::operation::Log::Input::INPUT)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(ifm_alloc, ofm_alloc); ++ fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); + } +@@ -1113,12 +1124,12 @@ void KernelGenerator::visit(const ir::operation::Round &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Round::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1128,12 +1139,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::LogicalNot::INPUT)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, output_alloc); ++ fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1144,28 +1155,43 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) + const auto lhs_index{node.getInputs().at(0)}; + const auto rhs_index{node.getInputs().at(1)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); + + 
_return_fn = std::move(fn); + } + +-void KernelGenerator::visit(const ir::operation::ZerosLike &node) ++void KernelGenerator::visit(const ir::operation::L2Normalization &node) + { + const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)}; ++ const auto input_index{node.getInputs().at(0)}; + + auto output_alloc = _tensor_builder->portableAt(output_index).get(); + auto input_alloc = _tensor_builder->portableAt(input_index).get(); + +- auto fn = std::make_unique(); ++ auto fn = std::make_unique(); + + fn->configure(input_alloc, output_alloc); ++ ++ _return_fn = std::move(fn); ++} ++ ++void KernelGenerator::visit(const ir::operation::ZerosLike &node) ++{ ++ const auto output_index{node.getOutputs().at(0)}; ++ const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)}; ++ ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ ++ auto fn = std::make_unique(); ++ ++ fn->configure(input_tensor, output_tensor); + _return_fn = std::move(fn); + } + +@@ -1176,14 +1202,14 @@ void KernelGenerator::visit(const ir::operation::Range &node) + const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)}; + const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto start_alloc = _tensor_builder->portableAt(start_index).get(); +- auto limit_alloc = _tensor_builder->portableAt(limit_index).get(); +- auto delta_alloc = _tensor_builder->portableAt(delta_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto start_tensor = _tensor_builder->portableAt(start_index).get(); ++ auto limit_tensor = _tensor_builder->portableAt(limit_index).get(); ++ auto delta_tensor = _tensor_builder->portableAt(delta_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(start_alloc, limit_alloc, delta_alloc, output_alloc); ++ fn->configure(start_tensor, limit_tensor, delta_tensor, output_tensor); + _return_fn = std::move(fn); + } + +@@ -1193,13 +1219,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) + const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + +- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); + _return_fn = std::move(fn); + } + +@@ -1209,13 +1235,13 @@ void KernelGenerator::visit(const ir::operation::Tile &node) + const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)}; + const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto multiples_alloc = _tensor_builder->portableAt(multiples_index).get(); ++ auto output_tensor = 
_tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto multiples_tensor = _tensor_builder->portableAt(multiples_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, multiples_alloc, output_alloc); ++ fn->configure(input_tensor, multiples_tensor, output_tensor); + _return_fn = std::move(fn); + } + +@@ -1226,14 +1252,14 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node) + const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)}; + const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto num_lower_alloc = _tensor_builder->portableAt(num_lower_index).get(); +- auto num_upper_alloc = _tensor_builder->portableAt(num_upper_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto num_lower_tensor = _tensor_builder->portableAt(num_lower_index).get(); ++ auto num_upper_tensor = _tensor_builder->portableAt(num_upper_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, num_lower_alloc, num_upper_alloc, output_alloc); ++ fn->configure(input_tensor, num_lower_tensor, num_upper_tensor, output_tensor); + _return_fn = std::move(fn); + } + +@@ -1243,16 +1269,16 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node) + const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); +- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); ++ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); + + const auto adj_x = node.param().adj_x; + const auto adj_y = node.param().adj_y; + + auto fn = std::make_unique(); + +- fn->configure(lhs_alloc, rhs_alloc, adj_x, adj_y, output_alloc); ++ fn->configure(lhs_tensor, rhs_tensor, adj_x, adj_y, output_tensor); + _return_fn = std::move(fn); + } + +@@ -1262,13 +1288,13 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node) + const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)}; + const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto shape_alloc = _tensor_builder->portableAt(shape_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto shape_tensor = _tensor_builder->portableAt(shape_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, shape_alloc, output_alloc); ++ fn->configure(input_tensor, shape_tensor, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1277,10 +1303,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) + { + const auto ofm_index{node.getOutputs().at(0)}; + +- auto output_alloc = 
_tensor_builder->portableAt(ofm_index).get(); +- std::vector input_allocs; ++ auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); ++ std::vector input_tensors; + for (auto &ifm_idx : node.getInputs()) +- input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); ++ input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); + + const auto epsilon = node.param().epsilon; + const auto is_training = node.param().is_training; +@@ -1288,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) + + auto fn = std::make_unique(); + +- fn->configure(input_allocs, epsilon, is_training, data_format, output_alloc); ++ fn->configure(input_tensors, epsilon, is_training, data_format, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1301,12 +1327,12 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node) + const auto beta = node.param().beta; + const auto axis = node.param().axis; + +- auto output_alloc = _tensor_builder->at(output_index).get(); +- auto input_alloc = _tensor_builder->at(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, beta, axis, output_alloc); ++ fn->configure(input_tensor, beta, axis, output_tensor); + + _return_fn = std::move(fn); + } +@@ -1318,14 +1344,45 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) + const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)}; + const auto padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)}; + +- auto output_alloc = _tensor_builder->portableAt(output_index).get(); +- auto input_alloc = _tensor_builder->portableAt(input_index).get(); +- auto block_shape_alloc = _tensor_builder->portableAt(block_shape_index).get(); +- auto padding_alloc = _tensor_builder->portableAt(padding_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto block_shape_tensor = _tensor_builder->portableAt(block_shape_index).get(); ++ auto padding_tensor = _tensor_builder->portableAt(padding_index).get(); + + auto fn = std::make_unique(); + +- fn->configure(input_alloc, block_shape_alloc, padding_alloc, output_alloc); ++ fn->configure(input_tensor, block_shape_tensor, padding_tensor, output_tensor); ++ ++ _return_fn = std::move(fn); ++} ++ ++void KernelGenerator::visit(const ir::operation::Quantize &node) ++{ ++ const auto input_index{node.getInputs().at(ir::operation::Quantize::Input::INPUT)}; ++ const auto output_index{node.getOutputs().at(0)}; ++ ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ ++ auto fn = std::make_unique(); ++ ++ fn->configure(input_tensor, output_tensor); ++ ++ _return_fn = std::move(fn); ++} ++ ++void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) ++{ ++ const auto input_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; ++ const auto output_index{node.getOutputs().at(0)}; ++ auto block_size = node.param().block_size; ++ ++ auto input_tensor = _tensor_builder->portableAt(input_index).get(); ++ auto output_tensor = _tensor_builder->portableAt(output_index).get(); ++ ++ auto fn = std::make_unique(); ++ ++ fn->configure(input_tensor, block_size, output_tensor); + + 
_return_fn = std::move(fn); + } +diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h +index d6f4c28..f564bf8 100644 +--- a/runtime/onert/backend/cpu/KernelGenerator.h ++++ b/runtime/onert/backend/cpu/KernelGenerator.h +@@ -94,6 +94,7 @@ public: + void visit(const ir::operation::SquaredDifference &) override; + void visit(const ir::operation::Tile &) override; + void visit(const ir::operation::LogicalOr &) override; ++ void visit(const ir::operation::L2Normalization &) override; + void visit(const ir::operation::Range &) override; + void visit(const ir::operation::MatrixBandPart &) override; + void visit(const ir::operation::BatchMatMul &) override; +@@ -101,6 +102,8 @@ public: + void visit(const ir::operation::FusedBatchNorm &) override; + void visit(const ir::operation::LogSoftmax &) override; + void visit(const ir::operation::SpaceToBatchND &) override; ++ void visit(const ir::operation::Quantize &) override; ++ void visit(const ir::operation::SpaceToDepth &) override; + + private: + const ir::Operands &_ctx; +diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc +new file mode 100644 +index 0000000..8723072 +--- /dev/null ++++ b/runtime/onert/backend/cpu/StaticTensorManager.cc +@@ -0,0 +1,104 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "StaticTensorManager.h" ++#include "Tensor.h" ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++ ++StaticTensorManager::StaticTensorManager(const std::shared_ptr ®) ++ : _nonconst_mgr{new cpu_common::MemoryManager()}, _tensors{reg} ++{ ++ // DO NOTHING ++} ++ ++void StaticTensorManager::allocateNonconsts(void) ++{ ++ _nonconst_mgr->allocate(); ++ ++ for (auto &pair : _tensors->native_tensors()) ++ { ++ const auto &ind = pair.first; ++ auto tensor = pair.second; ++ if (!_as_constants[ind] && !tensor->is_dynamic()) ++ { ++ auto *buffer = _nonconst_mgr->getBuffer(ind); ++ tensor->setBuffer(buffer); ++ ++ VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() ++ << "): " << static_cast(buffer) << std::endl; ++ } ++ } ++} ++ ++void StaticTensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); } ++ ++void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, ++ const ir::OperandInfo &tensor_info, ir::Layout backend_layout, ++ bool as_const) ++{ ++ assert(!_tensors->getITensor(ind)); ++ if (as_const) ++ { ++ auto tensor = std::make_shared(tensor_info, backend_layout); ++ _tensors->setNativeTensor(ind, tensor); ++ } ++ else ++ { ++ auto tensor = std::make_shared(tensor_info, backend_layout); ++ _tensors->setNativeTensor(ind, tensor); ++ } ++ _as_constants[ind] = as_const; ++} ++ ++void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) ++{ ++ assert(_tensors->getITensor(ind)); ++ ++ // This method is called only when a tensor has proper shape ++ assert(!_tensors->getITensor(ind)->is_dynamic()); ++ ++ if (!_as_constants[ind]) ++ _nonconst_mgr->claimPlan(ind, size); ++} ++ ++void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) ++{ ++ assert(_tensors->getITensor(ind)); ++ ++ // This method is called only when a tensor has proper shape ++ assert(!_tensors->getITensor(ind)->is_dynamic()); ++ ++ if (!_as_constants[ind]) ++ _nonconst_mgr->releasePlan(ind); ++} ++ ++void StaticTensorManager::iterate(const std::function &fn) ++{ ++ for (const auto &it : _tensors->native_tensors()) ++ fn(it.first); ++} ++ ++} // namespace cpu ++} // namespace backend ++} // namespace onert +diff --git a/runtime/onert/backend/cpu/StaticTensorManager.h b/runtime/onert/backend/cpu/StaticTensorManager.h +new file mode 100644 +index 0000000..66243a5 +--- /dev/null ++++ b/runtime/onert/backend/cpu/StaticTensorManager.h +@@ -0,0 +1,61 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#ifndef __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ ++#define __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ ++ ++#include "backend/IStaticTensorManager.h" ++#include "backend/cpu_common/MemoryManager.h" ++#include "backend/cpu_common/TensorRegistry.h" ++#include "backend/ITensorManager.h" ++#include "ir/OperandIndexMap.h" ++#include "ir/OperandInfo.h" ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++ ++class StaticTensorManager : public backend::IStaticTensorManager ++{ ++public: ++ StaticTensorManager(const std::shared_ptr &reg); ++ virtual ~StaticTensorManager() = default; ++ ++ void allocateNonconsts(void); ++ void deallocateNonconsts(void); ++ ++ void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ++ ir::Layout backend_layout, bool as_const); ++ ++ void claimPlan(const ir::OperandIndex &ind, uint32_t size); ++ void releasePlan(const ir::OperandIndex &ind); ++ ++ void iterate(const std::function &fn); ++ ++private: ++ std::unique_ptr _nonconst_mgr; ++ const std::shared_ptr _tensors; ++ ir::OperandIndexMap _as_constants; ++}; ++ ++} // namespace cpu ++} // namespace backend ++} // namespace onert ++ ++#endif // __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ +diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h +index 4dd251b..da16d05 100644 +--- a/runtime/onert/backend/cpu/Tensor.h ++++ b/runtime/onert/backend/cpu/Tensor.h +@@ -29,8 +29,14 @@ namespace cpu + + using Tensor = cpu_common::Tensor; + +-// Tensor which has data from external. To support this, assume below things +-// no padding, always NHWC layout, constant tensor and not dynamic ++/** ++ * @brief Class that uses data from external memory that is not managed by a backend ++ * instead of allocating and copying the data. ExternalTensor's data pointer points to ++ * an address of memory that is already allocated, or to an mmapped area. ++ * This means that ExternalTensor can take any type of ir::Data. ++ * To support this, the following are assumed: no padding, always NHWC layout, ++ * constant tensor, and not dynamic.
++ */ + class ExternalTensor : public Tensor + { + public: +@@ -45,6 +51,11 @@ public: + } + + public: ++ /** ++ * @brief set Data to be shared from external so that this ExternalTensor will not be ++ * allocated on CPU backend ++ * @param[in] data data of Operand to be set ++ */ + void setData(const std::shared_ptr data) + { + assert(data != nullptr); +diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc +index 886e8d8..7eb3ce8 100644 +--- a/runtime/onert/backend/cpu/TensorBuilder.cc ++++ b/runtime/onert/backend/cpu/TensorBuilder.cc +@@ -29,7 +29,7 @@ namespace cpu + + TensorBuilder::TensorBuilder() + : _tensor_reg{new cpu_common::TensorRegistry()}, +- _static_tensor_mgr{new cpu_common::StaticTensorManager(_tensor_reg)}, ++ _static_tensor_mgr{new StaticTensorManager(_tensor_reg)}, + _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)} + { + /* empty */ +@@ -77,11 +77,7 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const + return _tensor_info_map.find(ind) != _tensor_info_map.end(); + } + +-void TensorBuilder::prepare(void) +-{ +- _static_tensor_mgr->allocateConsts(); +- _static_tensor_mgr->allocateNonconsts(); +-} ++void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); } + + void TensorBuilder::allocate() + { +@@ -99,17 +95,17 @@ std::shared_ptr TensorBuilder::portableAt(const ir::OperandInde + return _tensor_reg->getPortableTensor(ind); + } + +-bool TensorBuilder::setExternalTensor(const ir::OperandIndex &ind, +- const std::shared_ptr &tensor) ++bool TensorBuilder::setMigrantTensor(const ir::OperandIndex &ind, ++ const std::shared_ptr &tensor) + { +- return _tensor_reg->setExternalTensor(ind, tensor); ++ return _tensor_reg->setMigrantTensor(ind, tensor); + } + + void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->iterate(fn); } + +-std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) ++std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) + { +- return _tensor_reg->getManagedTensor(ind); ++ return _tensor_reg->getNativeTensor(ind); + } + + std::unique_ptr TensorBuilder::releaseStaticTensorManager(void) +diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h +index ba25451..12ca28c 100644 +--- a/runtime/onert/backend/cpu/TensorBuilder.h ++++ b/runtime/onert/backend/cpu/TensorBuilder.h +@@ -18,13 +18,14 @@ + #define __ONERT_BACKEND_CPU_TENSOR_BUILDER_H__ + + #include +-#include + #include +-#include + + #include + #include + ++#include "StaticTensorManager.h" ++#include "Tensor.h" ++ + #include + + namespace onert +@@ -80,16 +81,16 @@ public: + * If not, program will crash with assert or exception. 
+ * @return shared_ptr + */ +- std::shared_ptr at(const ir::OperandIndex &ind); ++ std::shared_ptr at(const ir::OperandIndex &ind); + std::shared_ptr portableAt(const ir::OperandIndex &ind); +- bool setExternalTensor(const ir::OperandIndex &ind, +- const std::shared_ptr &tensor) override; ++ bool setMigrantTensor(const ir::OperandIndex &ind, ++ const std::shared_ptr &tensor) override; + + std::shared_ptr tensorRegistry() override { return _tensor_reg; } + + private: + const std::shared_ptr _tensor_reg; +- std::unique_ptr _static_tensor_mgr; ++ std::unique_ptr _static_tensor_mgr; + std::unique_ptr _dynamic_tensor_mgr; + ir::OperandIndexMap _tensor_info_map; + }; +diff --git a/runtime/onert/backend/cpu/ops/CompareLayer.cc b/runtime/onert/backend/cpu/ops/CompareLayer.cc +index f557f3a..adf902a 100644 +--- a/runtime/onert/backend/cpu/ops/CompareLayer.cc ++++ b/runtime/onert/backend/cpu/ops/CompareLayer.cc +@@ -17,6 +17,7 @@ + + #include "OperationUtils.h" + ++#include + #include + using namespace nnfw::cker; + namespace onert +@@ -34,6 +35,14 @@ namespace + using OpType = onert::ir::operation::Comparison::ComparisonType; + using namespace onert::backend::cpu; + ++// Assumes these enum values to be in the order like this ++static_assert(static_cast(OpType::Equal) == 0, "An OpType value has changed!"); ++static_assert(static_cast(OpType::NotEqual) == 1, "An OpType value has changed!"); ++static_assert(static_cast(OpType::Greater) == 2, "An OpType value has changed!"); ++static_assert(static_cast(OpType::GreaterEqual) == 3, "An OpType value has changed!"); ++static_assert(static_cast(OpType::Less) == 4, "An OpType value has changed!"); ++static_assert(static_cast(OpType::LessEqual) == 5, "An OpType value has changed!"); ++ + template + void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output, + OpType op_type) +@@ -52,95 +61,33 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort + ¶ms.input2_shift); + params.is_broadcast = !HaveSameShapes(lhs, rhs); + +- if (params.is_broadcast) +- { +- switch (op_type) +- { +- case OpType::Equal: +- Broadcast4DSlowEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::NotEqual: +- Broadcast4DSlowNotEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Greater: +- Broadcast4DSlowGreaterWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::GreaterEqual: +- Broadcast4DSlowGreaterEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Less: +- Broadcast4DSlowLessWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::LessEqual: 
+- Broadcast4DSlowLessEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- default: +- throw std::runtime_error{"Invalid OpType for CompareLayer"}; +- } +- } +- else // if (requires_broadcast == false) +- { +- switch (op_type) +- { +- case OpType::Equal: +- EqualWithScaling(params, getExtendedTensorShape(lhs), +- reinterpret_cast(lhs->buffer()), getExtendedTensorShape(rhs), +- reinterpret_cast(rhs->buffer()), getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- case OpType::NotEqual: +- NotEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Greater: +- GreaterWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::GreaterEqual: +- GreaterEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Less: +- LessWithScaling(params, getExtendedTensorShape(lhs), +- reinterpret_cast(lhs->buffer()), getExtendedTensorShape(rhs), +- reinterpret_cast(rhs->buffer()), getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- case OpType::LessEqual: +- LessEqualWithScaling( +- params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- default: +- throw std::runtime_error{"Invalid OpType for CompareLayer"}; +- } +- } +- return; ++ using CompareFunction = ++ void (*)(ComparisonParams & params, const Shape &input1_shape, const T *input1_data, ++ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, ++ bool *output_data); ++ ++ static const CompareFunction broadcast_fns[] = { ++ Broadcast4DSlowEqualWithScaling, Broadcast4DSlowNotEqualWithScaling, ++ Broadcast4DSlowGreaterWithScaling, Broadcast4DSlowGreaterEqualWithScaling, ++ Broadcast4DSlowLessWithScaling, Broadcast4DSlowLessEqualWithScaling, ++ }; ++ static const CompareFunction non_broadcast_fns[] = { ++ EqualWithScaling, NotEqualWithScaling, GreaterWithScaling, ++ GreaterEqualWithScaling, LessWithScaling, LessEqualWithScaling, ++ }; ++ ++ static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns), ++ "Sizes of broadcast_fns and non_broadcast_fns must match!"); ++ ++ auto index = static_cast(op_type); ++ if (index < 0 || index >= static_cast(sizeof(broadcast_fns) / sizeof(broadcast_fns[0]))) ++ throw std::runtime_error{"Invalid OpType for CompareLayer"}; ++ ++ CompareFunction fn = (params.is_broadcast ? 
broadcast_fns[index] : non_broadcast_fns[index]); ++ ++ fn(params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), ++ getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), ++ getExtendedTensorShape(output), reinterpret_cast(output->buffer())); + } + + template +@@ -149,94 +96,33 @@ void compareScalar(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort + { + bool requires_broadcast = !HaveSameShapes(lhs, rhs); + +- if (requires_broadcast) +- { +- switch (op_type) +- { +- case OpType::Equal: +- Broadcast4DSlowEqual( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::NotEqual: +- Broadcast4DSlowNotEqual( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Greater: +- Broadcast4DSlowGreater( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::GreaterEqual: +- Broadcast4DSlowGreaterEqual( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Less: +- Broadcast4DSlowLess(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- case OpType::LessEqual: +- Broadcast4DSlowLessEqual( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- default: +- throw std::runtime_error{"Invalid OpType for CompareLayer"}; +- } +- } +- else // if (requires_broadcast == false) +- { +- switch (op_type) +- { +- case OpType::Equal: +- EqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::NotEqual: +- NotEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- case OpType::Greater: +- GreaterNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- case OpType::GreaterEqual: +- GreaterEqualNoScaling( +- getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case OpType::Less: +- LessNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), reinterpret_cast(output->buffer())); +- break; +- case 
OpType::LessEqual: +- LessEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), +- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), +- getExtendedTensorShape(output), +- reinterpret_cast(output->buffer())); +- break; +- default: +- throw std::runtime_error{"Invalid OpType for CompareLayer"}; +- } +- } +- return; ++ using CompareFunction = ++ void (*)(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, ++ const T *input2_data, const Shape &output_shape, bool *output_data); ++ ++ static const CompareFunction broadcast_fns[] = { ++ Broadcast4DSlowEqual, Broadcast4DSlowNotEqual, Broadcast4DSlowGreater, ++ Broadcast4DSlowGreaterEqual, Broadcast4DSlowLess, Broadcast4DSlowLessEqual, ++ }; ++ static const CompareFunction non_broadcast_fns[] = { ++ EqualNoScaling, NotEqualNoScaling, GreaterNoScaling, ++ GreaterEqualNoScaling, LessNoScaling, LessEqualNoScaling, ++ }; ++ ++ static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns), ++ "Sizes of broadcast_fns and non_broadcast_fns must match!"); ++ ++ auto index = static_cast(op_type); ++ if (index < 0 || index >= static_cast(sizeof(broadcast_fns) / sizeof(broadcast_fns[0]))) ++ throw std::runtime_error{"Invalid OpType for CompareLayer"}; ++ ++ CompareFunction fn = (requires_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]); ++ ++ fn(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), ++ getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), ++ getExtendedTensorShape(output), reinterpret_cast(output->buffer())); + } ++ + } // namespace + + CompareLayer::CompareLayer() +diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +index c00be64..ff22e32 100644 +--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc ++++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +@@ -18,6 +18,7 @@ + + #include "../Tensor.h" + #include ++#include + + namespace onert + { +@@ -112,15 +113,32 @@ void FullyConnectedLayer::fullyConnectedHybrid() + getTensorShape(_bias), reinterpret_cast(_bias ? _bias->buffer() : nullptr), + getTensorShape(_output), reinterpret_cast(_output->buffer()), temp_arena); + +-// TODO Enable calling decrease_ref +-#if 0 ++// TODO Remove this ifdef ++#ifdef EXPERIMENTAL_RUY_FEATURE + if (_cached_weights == nullptr || _is_weights_freed) + return; + ++ // '_cached_weights is not nullptr and _is_weights_freed is false' means ++ // this weight shape is satisfied with the ruy kernel's prepack cache's condition. ++ // After entering here, it will not enter again except below the case - input is zero-vector ++ ++ // if input's elements are filled with zero, it by-passes(does not enter ruy-kernel path) ++ // so that handle this case ++ const int input_size = getTensorShape(_input).FlatSize(); ++ if (nnfw::cker::IsZeroVector(reinterpret_cast(_input->buffer()), input_size)) ++ return; ++ ++ // This weight tensor could be other ops' const tensor. ++ // Therefore, below reference should be checked like following + auto weight_tensor = dynamic_cast(_weights); + if (weight_tensor) + { + auto tensor = const_cast(weight_tensor); ++ if (tensor->buffer() == nullptr) // ref is already 0? ++ { ++ _is_weights_freed = true; ++ return; ++ } + + tensor->decrease_ref(); + if (tensor->buffer() == nullptr) // ref == 0? 
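The CompareLayer.cc change above replaces the two large per-operator switch statements with arrays of kernel function pointers indexed by the comparison type, pinned by static_asserts on the enum values and guarded by a bounds check before dispatch. Below is a minimal, self-contained sketch of that table-driven dispatch pattern, assuming plain float buffers and standard-library comparators in place of the cker kernels and IPortableTensor buffers; CompareOp, elementwise, and kCompareFns are illustrative names only, not part of this patch.

#include <cstddef>
#include <functional>
#include <stdexcept>

// Stand-in for the comparison enum; the table below assumes exactly this ordering.
enum class CompareOp
{
  Equal = 0,
  NotEqual = 1,
  Greater = 2,
  GreaterEqual = 3,
  Less = 4,
  LessEqual = 5
};

static_assert(static_cast<int>(CompareOp::Equal) == 0, "CompareOp ordering changed");
static_assert(static_cast<int>(CompareOp::LessEqual) == 5, "CompareOp ordering changed");

// Every comparison kernel shares one signature, so they can live in one table.
using CompareFn = void (*)(const float *lhs, const float *rhs, bool *out, std::size_t n);

template <typename Pred>
void elementwise(const float *lhs, const float *rhs, bool *out, std::size_t n)
{
  Pred pred;
  for (std::size_t i = 0; i < n; ++i)
    out[i] = pred(lhs[i], rhs[i]);
}

// One table entry per enum value; each entry replaces a whole switch case.
static const CompareFn kCompareFns[] = {
    elementwise<std::equal_to<float>>,  elementwise<std::not_equal_to<float>>,
    elementwise<std::greater<float>>,   elementwise<std::greater_equal<float>>,
    elementwise<std::less<float>>,      elementwise<std::less_equal<float>>,
};

void compare(CompareOp op, const float *lhs, const float *rhs, bool *out, std::size_t n)
{
  const auto index = static_cast<std::size_t>(op);
  if (index >= sizeof(kCompareFns) / sizeof(kCompareFns[0]))
    throw std::runtime_error{"Invalid CompareOp"};
  kCompareFns[index](lhs, rhs, out, n); // single dispatch point instead of a per-op switch
}

The patch applies the same idea twice per helper (a broadcast table and a non-broadcast table, selected by the is_broadcast flag), so supporting a new comparison means appending one entry per table rather than adding two switch cases, and the static_asserts turn any reordering of the enum into a compile-time error.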
+@@ -128,7 +146,7 @@ void FullyConnectedLayer::fullyConnectedHybrid() + _is_weights_freed = true; + } + } +-#endif // if 0 ++#endif + #endif + } + +@@ -167,7 +185,17 @@ void FullyConnectedLayer::run() + + void FullyConnectedLayer::prepare() + { ++ if (_bias && _bias->is_constant()) ++ { ++ const int bias_size = getTensorShape(_bias).FlatSize(); ++ if (nnfw::cker::IsZeroVector(reinterpret_cast(_bias->buffer()), bias_size)) ++ { ++ _bias = nullptr; ++ } ++ } ++ + #ifdef USE_RUY_GEMV ++#ifdef EXPERIMENTAL_RUY_FEATURE + // TODO This is workaround + // The only fc hybrid will use ruy kernel + if (_input->data_type() != OperandType::FLOAT32 || +@@ -199,6 +227,7 @@ void FullyConnectedLayer::prepare() + } + } + #endif ++#endif + } + + } // namespace ops +diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h +index dd5ef24..e405b24 100644 +--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h ++++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h +@@ -72,6 +72,9 @@ private: + + #ifdef USE_RUY_GEMV + uint8_t *_cached_weights = nullptr; // weights to be cached and a key ++#ifdef EXPERIMENTAL_RUY_FEATURE ++ bool _is_weights_freed = false; // is weights freed? ++#endif + #endif + }; + +diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.cc b/runtime/onert/backend/cpu/ops/L2NormLayer.cc +new file mode 100644 +index 0000000..0d99b05 +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/L2NormLayer.cc +@@ -0,0 +1,71 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "L2NormLayer.h" ++ ++#include "OperationUtils.h" ++ ++#include ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++ ++void L2NormLayer::configure(const IPortableTensor *input, IPortableTensor *output) ++{ ++ assert(input != nullptr); ++ assert(output != nullptr); ++ ++ _input = input; ++ _output = output; ++} ++ ++void L2NormLayer::run() ++{ ++ switch (_input->data_type()) ++ { ++ case OperandType::FLOAT32: ++ nnfw::cker::L2NormalizeFloat32( ++ getTensorShape(_input), reinterpret_cast(_input->buffer()), ++ getTensorShape(_output), reinterpret_cast(_output->buffer())); ++ break; ++ ++ case OperandType::QUANT_UINT8_ASYMM: ++ { ++ nnfw::cker::L2NormParams params; ++ assert(_input->data_offset() == 128); ++ params.input_zero_point = _input->data_offset(); ++ nnfw::cker::L2NormalizeQuant8( ++ params, getTensorShape(_input), reinterpret_cast(_input->buffer()), ++ getTensorShape(_output), reinterpret_cast(_output->buffer())); ++ } ++ break; ++ ++ default: ++ throw std::runtime_error{"L2Norm: Unsupported data type"}; ++ } ++} ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert +diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.h b/runtime/onert/backend/cpu/ops/L2NormLayer.h +new file mode 100644 +index 0000000..63f2d11 +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/L2NormLayer.h +@@ -0,0 +1,55 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License.
++ */ ++ ++#ifndef __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ ++#define __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ ++ ++#include ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++class L2NormLayer : public ::onert::exec::IFunction ++{ ++public: ++ L2NormLayer() : _input(nullptr), _output(nullptr) ++ { ++ // Nothing ++ } ++ ++public: ++ void configure(const IPortableTensor *_input, IPortableTensor *output); ++ ++ void run() override; ++ ++private: ++ const IPortableTensor *_input; ++ IPortableTensor *_output; ++}; ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert ++ ++#endif // __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ +diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc +index d71e325..06dde4f 100644 +--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc ++++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc +@@ -49,8 +49,8 @@ void LogSoftMaxLayer::logsoftmaxQuant8() + // NYI + } + +-void LogSoftMaxLayer::configure(const Tensor *input, const float beta, const int axis, +- Tensor *output) ++void LogSoftMaxLayer::configure(const IPortableTensor *input, const float beta, const int axis, ++ IPortableTensor *output) + { + _input = input; + _output = output; +diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h +index bc145ce..ba9deca 100644 +--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h ++++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h +@@ -40,13 +40,14 @@ public: + + void logsoftmaxQuant8(); + +- void configure(const Tensor *input, const float beta, const int axis, Tensor *output); ++ void configure(const IPortableTensor *input, const float beta, const int axis, ++ IPortableTensor *output); + + void run(); + + private: +- const Tensor *_input; +- Tensor *_output; ++ const IPortableTensor *_input; ++ IPortableTensor *_output; + + float _beta; + int _axis; +diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h +index 8d29374..9838552 100644 +--- a/runtime/onert/backend/cpu/ops/OperationUtils.h ++++ b/runtime/onert/backend/cpu/ops/OperationUtils.h +@@ -52,6 +52,17 @@ union DataPtr { + void *v; + }; + ++union ConstDataPtr { ++ const uint8_t *u8; ++ const int8_t *i8; ++ const uint32_t *u32; ++ const int32_t *i32; ++ const bool *b; ++ const float *f; ++ const int64_t *i64; ++ const void *v; ++}; ++ + uint32_t getNumberOfDimensions(const IPortableTensor *tensor); + + uint32_t getNumberOfElements(const IPortableTensor *tensor); +diff --git a/runtime/onert/backend/cpu/ops/PadLayer.cc b/runtime/onert/backend/cpu/ops/PadLayer.cc +index fcfcf7b..6a2bf9d 100644 +--- a/runtime/onert/backend/cpu/ops/PadLayer.cc ++++ b/runtime/onert/backend/cpu/ops/PadLayer.cc +@@ -33,33 +33,40 @@ PadLayer::PadLayer() + // DO NOTHING + } + +-void PadLayer::padFloat32() ++template void PadLayer::padImpl(const T *constant_value_data) + { +- nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input), +- reinterpret_cast(_input->buffer()), getTensorShape(_output), +- reinterpret_cast(_output->buffer()), _constantValueData.f); ++ nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input), ++ reinterpret_cast(_input->buffer()), getTensorShape(_output), ++ reinterpret_cast(_output->buffer()), constant_value_data); + } +-void PadLayer::padQuant8() { throw std::runtime_error("Quantized Pad isn't supported NYI"); } + + void PadLayer::configure(const 
IPortableTensor *input, IPortableTensor *output, +- const int32_t *padData, int32_t padRank, uint8_t *constantValueData) ++ const int32_t *padData, int32_t padRank, const void *constantValueData) + { + _input = input; + _output = output; + memcpy(_padData, padData, sizeof(_padData)); + _padRank = padRank; +- _constantValueData.u8 = constantValueData; ++ _constantValueData.v = constantValueData; + } + + void PadLayer::run() + { + if (_input->data_type() == OperandType::FLOAT32) + { +- padFloat32(); ++ padImpl(_constantValueData.f); + } + else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) + { +- padQuant8(); ++ if (_constantValueData.u8 == nullptr) ++ { ++ uint8_t pad_value = static_cast(_output->data_offset()); ++ padImpl(&pad_value); ++ } ++ else ++ { ++ padImpl(_constantValueData.u8); ++ } + } + else + { +diff --git a/runtime/onert/backend/cpu/ops/PadLayer.h b/runtime/onert/backend/cpu/ops/PadLayer.h +index 85bd2e6..efd73d5 100644 +--- a/runtime/onert/backend/cpu/ops/PadLayer.h ++++ b/runtime/onert/backend/cpu/ops/PadLayer.h +@@ -39,12 +39,10 @@ public: + PadLayer(); + + public: +- void padFloat32(); +- +- void padQuant8(); ++ template void padImpl(const T *constant_value_data); + + void configure(const IPortableTensor *input, IPortableTensor *output, const int32_t *padData, +- int32_t padRank, uint8_t *constantValueData = nullptr); ++ int32_t padRank, const void *constantValueData = nullptr); + + void run() override; + +@@ -54,7 +52,7 @@ private: + + int32_t _padData[8]; + int32_t _padRank; +- DataPtr _constantValueData; ++ ConstDataPtr _constantValueData; + }; + + } // namespace ops +diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.cc b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc +new file mode 100644 +index 0000000..45fc148 +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc +@@ -0,0 +1,63 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "QuantizeLayer.h" ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++ ++QuantizeLayer::QuantizeLayer() : _input(nullptr), _output(nullptr) ++{ ++ // DO NOTHING ++} ++ ++template void QuantizeLayer::affineQuantize() ++{ ++ nnfw::cker::Quantize(getTensorShape(_input), reinterpret_cast(_input->buffer()), ++ getTensorShape(_output), reinterpret_cast(_output->buffer()), ++ _output->data_scale(), _output->data_offset()); ++} ++ ++void QuantizeLayer::configure(const IPortableTensor *input, IPortableTensor *output) ++{ ++ _input = input; ++ _output = output; ++} ++ ++void QuantizeLayer::run() ++{ ++ if (_input->data_type() == OperandType::FLOAT32) ++ { ++ affineQuantize(); ++ } ++ else ++ { ++ throw std::runtime_error{"Quantize: unsupported data type"}; ++ } ++} ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert +diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.h b/runtime/onert/backend/cpu/ops/QuantizeLayer.h +new file mode 100644 +index 0000000..b4e7aca +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.h +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#ifndef __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ ++#define __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ ++ ++#include ++#include "OperationUtils.h" ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++ ++class QuantizeLayer : public ::onert::exec::IFunction ++{ ++public: ++ QuantizeLayer(); ++ ++public: ++ template void affineQuantize(); ++ ++ void configure(const IPortableTensor *input, IPortableTensor *output); ++ ++ void run() override; ++ ++private: ++ const IPortableTensor *_input; ++ IPortableTensor *_output; ++}; ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert ++ ++#endif // __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ +diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.cc b/runtime/onert/backend/cpu/ops/SliceLayer.cc +index a9106c1..449c073 100644 +--- a/runtime/onert/backend/cpu/ops/SliceLayer.cc ++++ b/runtime/onert/backend/cpu/ops/SliceLayer.cc +@@ -46,7 +46,7 @@ void SliceLayer::GetBeginAndSizeVectors(int dimensions, const IPortableTensor *b + } + } + +-void SliceLayer::sliceFloat32() ++template void SliceLayer::sliceImpl() + { + const int kMaxDim = nnfw::cker::Shape::kMaxSmallSize; + +@@ -74,14 +74,8 @@ void SliceLayer::sliceFloat32() + } + + nnfw::cker::Slice(op_params, getExtendedTensorShape(_input), +- reinterpret_cast(_input->buffer()), +- reinterpret_cast(_output->buffer())); +-} +- +-void SliceLayer::sliceQuant8() +-{ +- // cker quant8 slice is not implemented yet +- throw std::runtime_error{"NYI"}; ++ reinterpret_cast(_input->buffer()), ++ reinterpret_cast(_output->buffer())); + } + + void SliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin, +@@ -97,11 +91,11 @@ void SliceLayer::run() + { + if (_input->data_type() == OperandType::FLOAT32) + { +- sliceFloat32(); ++ sliceImpl(); + } + else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) + { +- sliceQuant8(); ++ sliceImpl(); + } + else + { +diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.h b/runtime/onert/backend/cpu/ops/SliceLayer.h +index 9945d7e..650e2c9 100644 +--- a/runtime/onert/backend/cpu/ops/SliceLayer.h ++++ b/runtime/onert/backend/cpu/ops/SliceLayer.h +@@ -42,8 +42,7 @@ public: + void run() override; + + private: +- void sliceFloat32(); +- void sliceQuant8(); ++ template void sliceImpl(); + + template + void GetBeginAndSizeVectors(int dimensions, const IPortableTensor *begin, +diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc +new file mode 100644 +index 0000000..110b0bc +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc +@@ -0,0 +1,70 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "SpaceToDepthLayer.h" ++ ++#include "OperationUtils.h" ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++SpaceToDepthLayer::SpaceToDepthLayer() : _input(nullptr), _block_size(0), _output(nullptr) ++{ ++ // DO NOTHING ++} ++ ++template void SpaceToDepthLayer::spaceToDepth() ++{ ++ ++ nnfw::cker::SpaceToDepthParams params; ++ params.block_size = _block_size; ++ ++ nnfw::cker::SpaceToDepth(params, getTensorShape(_input), ++ reinterpret_cast(_input->buffer()), ++ getTensorShape(_output), reinterpret_cast(_output->buffer())); ++} ++ ++void SpaceToDepthLayer::configure(const IPortableTensor *input, const int32_t block_size, ++ IPortableTensor *output) ++{ ++ _input = input; ++ _block_size = block_size; ++ _output = output; ++} ++ ++void SpaceToDepthLayer::run() ++{ ++ if (_input->data_type() == OperandType::FLOAT32) ++ { ++ spaceToDepth(); ++ } ++ else ++ { ++ throw std::runtime_error{"SpaceToDepth: unsupported data type"}; ++ } ++} ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert +diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h +new file mode 100644 +index 0000000..c11ef2b +--- /dev/null ++++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h +@@ -0,0 +1,54 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License.
++ */ ++ ++#ifndef __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ ++#define __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ ++ ++#include ++ ++#include ++ ++namespace onert ++{ ++namespace backend ++{ ++namespace cpu ++{ ++namespace ops ++{ ++class SpaceToDepthLayer : public ::onert::exec::IFunction ++{ ++public: ++ SpaceToDepthLayer(); ++ ++ void configure(const IPortableTensor *input, const int32_t block_size, IPortableTensor *output); ++ ++ void run() override; ++ ++private: ++ template void spaceToDepth(); ++ ++ const IPortableTensor *_input; ++ int32_t _block_size; ++ IPortableTensor *_output; ++}; ++ ++} // namespace ops ++} // namespace cpu ++} // namespace backend ++} // namespace onert ++ ++#endif // __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ +diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h +index a49525b..b760cda 100644 +--- a/runtime/onert/core/include/backend/ITensorBuilder.h ++++ b/runtime/onert/core/include/backend/ITensorBuilder.h +@@ -112,12 +112,12 @@ public: // methods for static tensor allocation + virtual std::shared_ptr tensorAt(const ir::OperandIndex &ind) = 0; + + /** +- * @brief Set the External Tensor object ++ * @brief Set the migrant tensor object + * + * @return true if succeeded + * @return false if failed or unsupported + */ +- virtual bool setExternalTensor(const ir::OperandIndex &, const std::shared_ptr &) ++ virtual bool setMigrantTensor(const ir::OperandIndex &, const std::shared_ptr &) + { + return false; + } +diff --git a/runtime/onert/core/include/backend/ITensorRegistry.h b/runtime/onert/core/include/backend/ITensorRegistry.h +index f5a95f4..8555131 100644 +--- a/runtime/onert/core/include/backend/ITensorRegistry.h ++++ b/runtime/onert/core/include/backend/ITensorRegistry.h +@@ -35,17 +35,22 @@ struct ITensorRegistry + virtual ~ITensorRegistry() = default; + + /** +- * @brief Returns pointer of ITensor among managed and external tensors ++ * @brief Returns pointer of ITensor among native and migrant tensors ++ * ++ * Native Tensor is a tensor that is managed by this backend ++ * Migrant Tensor is a tensor that is imported from another backend ++ * + * @note Return tensor cannot be used longer than dynamic tensor manager + */ + virtual std::shared_ptr getITensor(const ir::OperandIndex &) = 0; + /** +- * @brief Returns pointer of ITensor among managed tensors ++ * @brief Returns pointer of ITensor among native tensors + * +- * Unlike @c getITensor , this function only searches from managed tensors +- * @note Return tensor cannot be used longer than dynamic tensor manager ++ * Unlike @c getITensor , this function only searches from native tensors ++ * ++ * @note Returned tensor cannot be used longer than dynamic tensor manager + */ +- virtual std::shared_ptr getManagedITensor(const ir::OperandIndex &) = 0; ++ virtual std::shared_ptr getNativeITensor(const ir::OperandIndex &) = 0; + }; + + } // namespace backend +@@ -73,68 +78,67 @@ public: + std::shared_ptr getITensor(const ir::OperandIndex &ind) override + { + static_assert(std::is_base_of::value, "T_Tensor must derive from ITensor."); +- auto external_tensor = _external.find(ind); +- if (external_tensor != _external.end()) ++ auto external_tensor = _migrant.find(ind); ++ if (external_tensor != _migrant.end()) + return external_tensor->second; +- return getManagedTensor(ind); ++ return getNativeTensor(ind); + } + +- std::shared_ptr getManagedITensor(const ir::OperandIndex &ind) override ++ std::shared_ptr
getNativeITensor(const ir::OperandIndex &ind) override + { +- return getManagedTensor(ind); ++ return getNativeTensor(ind); + } + + std::shared_ptr getPortableTensor(const ir::OperandIndex &ind) + { +- auto external_tensor = _external.find(ind); +- if (external_tensor != _external.end()) ++ auto external_tensor = _migrant.find(ind); ++ if (external_tensor != _migrant.end()) + { + if (external_tensor->second) + return external_tensor->second; + } +- return getManagedTensor(ind); ++ return getNativeTensor(ind); + } + +- std::shared_ptr getManagedTensor(const ir::OperandIndex &ind) ++ std::shared_ptr getNativeTensor(const ir::OperandIndex &ind) + { +- auto tensor = _managed.find(ind); +- if (tensor != _managed.end()) ++ auto tensor = _native.find(ind); ++ if (tensor != _native.end()) + return tensor->second; + return nullptr; + } + +- bool setExternalTensor(const ir::OperandIndex &ind, +- const std::shared_ptr &tensor) ++ bool setMigrantTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) + { + // TODO Uncomment this as two tensors for an index is not allowed. + // But now it is temporarily allowed as a workaround. External one hides Managed one. +- // auto itr = _managed.find(ind); +- // if (itr != _managed.end() && itr->second != nullptr && tensor != nullptr) ++ // auto itr = _native.find(ind); ++ // if (itr != _native.end() && itr->second != nullptr && tensor != nullptr) + // throw std::runtime_error{ +- // "Tried to set an external tensor but an managed tensor already exists."}; +- _external[ind] = tensor; ++ // "Tried to set a migrant tensor but a native tensor already exists."}; ++ _migrant[ind] = tensor; + return true; + } + +- void setManagedTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) ++ void setNativeTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) + { +- auto itr = _external.find(ind); +- if (itr != _external.end() && itr->second != nullptr && tensor != nullptr) ++ auto itr = _migrant.find(ind); ++ if (itr != _migrant.end() && itr->second != nullptr && tensor != nullptr) + throw std::runtime_error{ +- "Tried to set a managed tensor but an external tensor already exists."}; +- _managed[ind] = tensor; ++ "Tried to set a native tensor but a migrant tensor already exists."}; ++ _native[ind] = tensor; + } + +- const ir::OperandIndexMap> &managed_tensors() { return _managed; } ++ const ir::OperandIndexMap> &native_tensors() { return _native; } + +- const ir::OperandIndexMap> &external_tensors() ++ const ir::OperandIndexMap> &migrant_tensors() + { +- return _external; ++ return _migrant; + } + + private: +- ir::OperandIndexMap> _external; +- ir::OperandIndexMap> _managed; ++ ir::OperandIndexMap> _migrant; ++ ir::OperandIndexMap> _native; + }; + + } // namespace backend +diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h +index 6ddacc7..a7e034a 100644 +--- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h ++++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h +@@ -19,7 +19,7 @@ + + #include "MemoryManager.h" + +-#include "backend/ITensorManager.h" ++#include "backend/IStaticTensorManager.h" + #include "ir/OperandIndexMap.h" + #include "ir/OperandInfo.h" + #include "TensorRegistry.h" +@@ -31,7 +31,7 @@ namespace backend + namespace cpu_common + { + +-class StaticTensorManager : public backend::ITensorManager ++class StaticTensorManager : public backend::IStaticTensorManager + { + public: +
StaticTensorManager(const std::shared_ptr ®); +diff --git a/runtime/onert/core/include/compiler/StaticShapeInference.h b/runtime/onert/core/include/compiler/StaticShapeInference.h +index 379143b..b3391a3 100644 +--- a/runtime/onert/core/include/compiler/StaticShapeInference.h ++++ b/runtime/onert/core/include/compiler/StaticShapeInference.h +@@ -99,6 +99,7 @@ private: + void visit(const ir::operation::LogicalNot &op) override; + void visit(const ir::operation::LogicalOr &op) override; + void visit(const ir::operation::Logistic &op) override; ++ void visit(const ir::operation::L2Normalization &op) override; + void visit(const ir::operation::MatrixBandPart &op) override; + void visit(const ir::operation::Max &op) override; + void visit(const ir::operation::Min &op) override; +diff --git a/runtime/onert/core/include/exec/DynamicShapeInference.h b/runtime/onert/core/include/exec/DynamicShapeInference.h +index 113c348..601c1bf 100644 +--- a/runtime/onert/core/include/exec/DynamicShapeInference.h ++++ b/runtime/onert/core/include/exec/DynamicShapeInference.h +@@ -72,6 +72,7 @@ public: + void visit(const ir::operation::LogicalNot &op) override; + void visit(const ir::operation::LogicalOr &op) override; + void visit(const ir::operation::Logistic &op) override; ++ void visit(const ir::operation::L2Normalization &op) override; + void visit(const ir::operation::MatrixBandPart &op) override; + void visit(const ir::operation::Max &op) override; + void visit(const ir::operation::Min &op) override; +diff --git a/runtime/onert/core/include/ir/Operations.Include.h b/runtime/onert/core/include/ir/Operations.Include.h +index 5fac54e..e3b5d19 100644 +--- a/runtime/onert/core/include/ir/Operations.Include.h ++++ b/runtime/onert/core/include/ir/Operations.Include.h +@@ -103,3 +103,4 @@ + #include "ir/operation/BatchMatMul.h" + #include "ir/operation/FusedBatchNorm.h" + #include "ir/operation/LogSoftmax.h" ++#include "ir/operation/Quantize.h" +diff --git a/runtime/onert/core/include/ir/Operations.lst b/runtime/onert/core/include/ir/Operations.lst +index 9d0642f..03a2aa2 100644 +--- a/runtime/onert/core/include/ir/Operations.lst ++++ b/runtime/onert/core/include/ir/Operations.lst +@@ -106,3 +106,4 @@ OP(MatrixBandPart) + OP(BatchMatMul) + OP(FusedBatchNorm) + OP(LogSoftmax) ++OP(Quantize) +diff --git a/runtime/onert/core/include/ir/operation/LogSoftmax.h b/runtime/onert/core/include/ir/operation/LogSoftmax.h +index 26a92d7..391b4ba 100644 +--- a/runtime/onert/core/include/ir/operation/LogSoftmax.h ++++ b/runtime/onert/core/include/ir/operation/LogSoftmax.h +@@ -48,7 +48,7 @@ public: + + public: + void accept(OperationVisitor &v) const override; +- OpCode opcode() const final { return OpCode::Softmax; } ++ OpCode opcode() const final { return OpCode::LogSoftmax; } + + public: + const Param ¶m() const { return _param; } +diff --git a/runtime/onert/core/include/ir/operation/Pad.h b/runtime/onert/core/include/ir/operation/Pad.h +index a486061..00481cd 100644 +--- a/runtime/onert/core/include/ir/operation/Pad.h ++++ b/runtime/onert/core/include/ir/operation/Pad.h +@@ -33,7 +33,7 @@ public: + { + INPUT = 0, + PAD = 1, +- // VALUE = 2 Not allow padding value operand yet ++ VALUE = 2 + }; + + public: +diff --git a/runtime/onert/core/include/ir/operation/Quantize.h b/runtime/onert/core/include/ir/operation/Quantize.h +new file mode 100644 +index 0000000..2533ce4 +--- /dev/null ++++ b/runtime/onert/core/include/ir/operation/Quantize.h +@@ -0,0 +1,49 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#ifndef __ONERT_IR_OPERATION_QUANTIZE_H__ ++#define __ONERT_IR_OPERATION_QUANTIZE_H__ ++ ++#include "ir/Operation.h" ++ ++namespace onert ++{ ++namespace ir ++{ ++namespace operation ++{ ++ ++class Quantize : public Operation ++{ ++public: ++ enum Input ++ { ++ INPUT = 0, ++ }; ++ ++public: ++ Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs); ++ ++public: ++ void accept(OperationVisitor &v) const override; ++ OpCode opcode() const final { return OpCode::Quantize; } ++}; ++ ++} // namespace operation ++} // namespace ir ++} // namespace onert ++ ++#endif // __ONERT_IR_OPERATION_QUANTIZE_H__ +diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc +index 32a8041..c374aba 100644 +--- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc ++++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc +@@ -36,7 +36,7 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptrgetManagedTensor(ind); ++ auto user_tensor = _user_tensors->getNativeTensor(ind); + if (user_tensor) + { + // User tensors cannot be reallocated. 
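The tensor-manager hunks here and the ITensorRegistry changes above follow one native/migrant policy: getITensor() prefers a migrant (imported) tensor, getNativeITensor() only sees tensors the backend owns, and registering a native tensor over an existing migrant one is rejected. The following is a rough, self-contained sketch of that policy under simplified assumptions: plain integer indices and a placeholder Tensor type stand in for onert's ir::OperandIndex and ITensor, and this is not the real PortableTensorRegistry implementation.

// Sketch of the native/migrant lookup and registration policy, as described above.
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>

namespace sketch
{

struct Tensor
{
  explicit Tensor(std::string name) : name(std::move(name)) {}
  std::string name;
};

class Registry
{
public:
  std::shared_ptr<Tensor> getITensor(int ind) const
  {
    auto it = _migrant.find(ind);
    if (it != _migrant.end())
      return it->second; // a migrant tensor hides the native one
    return getNativeITensor(ind);
  }

  std::shared_ptr<Tensor> getNativeITensor(int ind) const
  {
    auto it = _native.find(ind);
    return it != _native.end() ? it->second : nullptr;
  }

  void setMigrantTensor(int ind, const std::shared_ptr<Tensor> &tensor) { _migrant[ind] = tensor; }

  void setNativeTensor(int ind, const std::shared_ptr<Tensor> &tensor)
  {
    auto it = _migrant.find(ind);
    if (it != _migrant.end() && it->second != nullptr && tensor != nullptr)
      throw std::runtime_error{"Tried to set a native tensor but a migrant tensor already exists."};
    _native[ind] = tensor;
  }

private:
  std::unordered_map<int, std::shared_ptr<Tensor>> _migrant;
  std::unordered_map<int, std::shared_ptr<Tensor>> _native;
};

} // namespace sketch

int main()
{
  sketch::Registry reg;
  reg.setNativeTensor(0, std::make_shared<sketch::Tensor>("native#0"));
  reg.setMigrantTensor(1, std::make_shared<sketch::Tensor>("migrant#1"));

  std::cout << reg.getITensor(1)->name << std::endl;                           // migrant#1
  std::cout << (reg.getNativeITensor(1) ? "found" : "not found") << std::endl; // not found
  return 0;
}

The same policy is what the new prepareExternalTensors() step later in this patch builds on: when a backend has no tensor of its own for an operand, the tensor created by another backend is handed over through setMigrantTensor().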
+@@ -47,8 +47,8 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha + user_tensor->setShape(new_shape); + } + +- // NOTE Then handle managed tensors +- auto tensor = _tensors->getManagedTensor(ind); ++ // NOTE Then handle native tensors ++ auto tensor = _tensors->getNativeTensor(ind); + assert(tensor); + + bool previously_dynamic = tensor->is_dynamic(); +@@ -101,9 +101,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, + const ir::OperandInfo &tensor_info, + ir::Layout backend_layout) + { +- assert(_tensors->getManagedTensor(ind) == nullptr); ++ assert(_tensors->getNativeTensor(ind) == nullptr); + auto tensor = std::make_shared(tensor_info, backend_layout); +- _tensors->setManagedTensor(ind, tensor); ++ _tensors->setNativeTensor(ind, tensor); + } + + void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) +@@ -130,7 +130,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) + auto &input_set = find->second; + for (auto input_ind : input_set) + { +- if (!_tensors->getManagedTensor(input_ind)->is_dynamic()) ++ if (!_tensors->getNativeTensor(input_ind)->is_dynamic()) + continue; + + _dynamic_mem_mgr->deallocate(input_ind); +@@ -141,7 +141,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) + + void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) + { +- if (!_tensors->getManagedTensor(output_ind)->is_dynamic()) ++ if (!_tensors->getNativeTensor(output_ind)->is_dynamic()) + return; + + _dynamic_mem_mgr->deallocate(output_ind); +diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc +index 4b683fb..eb83b7d 100644 +--- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc ++++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc +@@ -81,23 +81,23 @@ void KernelGenerator::visit(const ir::operation::If &node) + std::vector> input_tensors; + for (const auto input_index : node.getInputs()) + { +- auto input_alloc = getTensor(input_index); ++ auto input_tensor = getTensor(input_index); + +- input_tensors.emplace_back(input_alloc); ++ input_tensors.emplace_back(input_tensor); + } + + std::vector> output_tensors; + exec::DynAllocInfoMap outputs_dyn_alloc_info; + for (const auto output_index : node.getOutputs()) + { +- auto output_alloc = getTensor(output_index); ++ auto output_tensor = getTensor(output_index); + +- output_tensors.emplace_back(output_alloc); ++ output_tensors.emplace_back(output_tensor); + const auto output_tensor_builder = getTensorBuilder(output_index); + if (output_tensor_builder->supportDynamicTensor()) + { + auto output_dyn_manager = output_tensor_builder->dynamicTensorManager(); +- outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager}; ++ outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager}; + } + } + +@@ -146,24 +146,24 @@ void KernelGenerator::visit(const ir::operation::While &node) + std::vector> input_tensors; + for (const auto input_index : node.getInputs()) + { +- auto input_alloc = getTensor(input_index); ++ auto input_tensor = getTensor(input_index); + +- input_tensors.emplace_back(input_alloc); ++ input_tensors.emplace_back(input_tensor); + } + + std::vector> output_tensors; + std::unordered_map, exec::DynAllocInfo> outputs_dyn_alloc_info; + for (const auto output_index : node.getOutputs()) + { +- auto output_alloc = 
getTensor(output_index); ++ auto output_tensor = getTensor(output_index); + +- output_tensors.emplace_back(output_alloc); ++ output_tensors.emplace_back(output_tensor); + + const auto output_tensor_builder = getTensorBuilder(output_index); + if (output_tensor_builder->supportDynamicTensor()) + { + auto output_dyn_manager = output_tensor_builder->dynamicTensorManager(); +- outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager}; ++ outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager}; + } + } + +@@ -199,7 +199,7 @@ KernelGenerator::getTensorBuilder(const ir::OperandIndex &index) + for (auto tensor_builder : _tensor_builder_set) + { + auto reg = tensor_builder->tensorRegistry(); +- auto tensor = reg ? reg->getManagedITensor(index) : tensor_builder->tensorAt(index); ++ auto tensor = reg ? reg->getNativeITensor(index) : tensor_builder->tensorAt(index); + if (tensor) + { + ret = tensor_builder; +diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc +index 16cd3ec..5bddb91 100644 +--- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc ++++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc +@@ -92,7 +92,7 @@ void TensorBuilder::allocate() + std::shared_ptr TensorBuilder::tensorAt(const ir::OperandIndex &ind) + { + // NOTE Find from User Tensor Registry first +- // FIXME There may be both user tensor and managed tensor for a `ind` which is a waste ++ // FIXME There may be both user tensor and native tensor for a `ind` which is a waste + auto user_tensor = _user_tensor_reg->getITensor(ind); + auto tensor = _tensor_reg->getITensor(ind); + if (user_tensor) +@@ -107,7 +107,7 @@ void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->ite + + std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) + { +- return _tensor_reg->getManagedTensor(ind); ++ return _tensor_reg->getNativeTensor(ind); + } + + std::unique_ptr TensorBuilder::releaseStaticTensorManager(void) +@@ -123,7 +123,7 @@ std::unique_ptr TensorBuilder::releaseDynamicTensorManager(void) + void TensorBuilder::setUserTensor(const ir::OperandIndex &ind, + const std::shared_ptr &tensor) + { +- _user_tensor_reg->setManagedTensor(ind, tensor); ++ _user_tensor_reg->setNativeTensor(ind, tensor); + } + + } // namespace controlflow +diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/controlflow/UserTensor.h +index ce94ea0..b9b2d52 100644 +--- a/runtime/onert/core/src/backend/controlflow/UserTensor.h ++++ b/runtime/onert/core/src/backend/controlflow/UserTensor.h +@@ -68,6 +68,7 @@ public: + void set_dynamic() override { _dynamic = true; } + ir::Shape getShape() const override { return _info.shape(); } + void setShape(const ir::Shape &new_shape) override { _info.shape(new_shape); } ++ bool is_constant() const override { return false; } + + private: + ir::OperandInfo _info; +diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc +index 0ccf700..ede403b 100644 +--- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc ++++ b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc +@@ -35,7 +35,7 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha + { + VERBOSE_F() << ind << std::endl; + +- auto tensor = _tensors->getManagedTensor(ind); ++ auto tensor = 
_tensors->getNativeTensor(ind); + assert(tensor); + + bool previously_dynamic = tensor->is_dynamic(); +@@ -88,9 +88,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, + const ir::OperandInfo &tensor_info, + ir::Layout backend_layout) + { +- assert(_tensors->getManagedTensor(ind) == nullptr); ++ assert(_tensors->getNativeTensor(ind) == nullptr); + auto tensor = std::make_shared(tensor_info, backend_layout); +- _tensors->setManagedTensor(ind, tensor); ++ _tensors->setNativeTensor(ind, tensor); + } + + void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) +@@ -117,7 +117,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) + auto &input_set = find->second; + for (auto input_ind : input_set) + { +- auto *tensor = _tensors->getManagedTensor(input_ind).get(); ++ auto *tensor = _tensors->getNativeTensor(input_ind).get(); + if (!tensor->is_dynamic()) + continue; + +@@ -131,7 +131,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) + + void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) + { +- auto *tensor = _tensors->getManagedTensor(output_ind).get(); ++ auto *tensor = _tensors->getNativeTensor(output_ind).get(); + if (!tensor->is_dynamic()) + return; + +diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc +index 47bea35..8604542 100644 +--- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc ++++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc +@@ -33,7 +33,7 @@ StaticTensorManager::StaticTensorManager(const std::shared_ptr & + + void StaticTensorManager::allocateConsts(void) + { +- for (auto &pair : _tensors->managed_tensors()) ++ for (auto &pair : _tensors->native_tensors()) + { + const auto &ind = pair.first; + auto tensor = pair.second; +@@ -42,9 +42,9 @@ void StaticTensorManager::allocateConsts(void) + auto mem_alloc = _const_mgr->allocate(ind, tensor->total_size()); + tensor->setBuffer(mem_alloc); + auto buffer = mem_alloc->base(); +- VERBOSE(CPU_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() +- << "): " << static_cast(buffer) +- << "size : " << tensor->total_size() << std::endl; ++ VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() ++ << "): " << static_cast(buffer) ++ << "size : " << tensor->total_size() << std::endl; + } + } + } +@@ -53,7 +53,7 @@ void StaticTensorManager::allocateNonconsts(void) + { + _nonconst_mgr->allocate(); + +- for (auto &pair : _tensors->managed_tensors()) ++ for (auto &pair : _tensors->native_tensors()) + { + const auto &ind = pair.first; + auto tensor = pair.second; +@@ -62,8 +62,8 @@ void StaticTensorManager::allocateNonconsts(void) + auto *buffer = _nonconst_mgr->getBuffer(ind); + tensor->setBuffer(buffer); + +- VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() +- << "): " << static_cast(buffer) << std::endl; ++ VERBOSE(CPU_COMMON_StaticTensorManager) << "TENSOR(#" << ind.value() ++ << "): " << static_cast(buffer) << std::endl; + } + } + } +@@ -76,18 +76,18 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, + const ir::OperandInfo &tensor_info, ir::Layout backend_layout, + bool as_const) + { +- assert(!_tensors->getManagedTensor(ind)); ++ assert(!_tensors->getNativeTensor(ind)); + auto tensor = std::make_shared(tensor_info, backend_layout); +- _tensors->setManagedTensor(ind, tensor); ++ _tensors->setNativeTensor(ind, tensor); + 
_as_constants[ind] = as_const; + } + + void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) + { +- assert(_tensors->getManagedTensor(ind)); ++ assert(_tensors->getNativeTensor(ind)); + + // This method is called only when a tensor has proper shape +- assert(!_tensors->getManagedTensor(ind)->is_dynamic()); ++ assert(!_tensors->getNativeTensor(ind)->is_dynamic()); + + if (!_as_constants[ind]) + _nonconst_mgr->claimPlan(ind, size); +@@ -95,10 +95,10 @@ void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) + + void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) + { +- assert(_tensors->getManagedTensor(ind)); ++ assert(_tensors->getNativeTensor(ind)); + + // This method is called only when a tensor has proper shape +- assert(!_tensors->getManagedTensor(ind)->is_dynamic()); ++ assert(!_tensors->getNativeTensor(ind)->is_dynamic()); + + if (!_as_constants[ind]) + _nonconst_mgr->releasePlan(ind); +@@ -106,7 +106,7 @@ void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) + + void StaticTensorManager::iterate(const std::function &fn) + { +- for (const auto &it : _tensors->managed_tensors()) ++ for (const auto &it : _tensors->native_tensors()) + fn(it.first); + } + +diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc +index f3f69ad..8439b6a 100644 +--- a/runtime/onert/core/src/compiler/ExecutorFactory.cc ++++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc +@@ -201,18 +201,35 @@ ExecutorFactory::initializeModelIOTensors(ir::LoweredGraph &lowered_graph, + // Add tensor to controlflow TensorRegistry. + cf_tensor_builder->setUserTensor(ind, tensor); + ret.push_back(tensor); +- +- // Set other tensors as external tensors +- for (auto &tensor_builder : tensor_builders) +- { +- // FIXME This is a workaround registering all user tensors to all backends +- // FIXME Handle when it is failed +- tensor_builder->setExternalTensor(ind, tensor); +- } + } + return ret; + } + ++void ExecutorFactory::prepareExternalTensors(ir::LoweredGraph &lowered_graph, ++ TensorBuilders &tensor_builders) ++{ ++ lowered_graph.op_seqs().iterate( ++ [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) { ++ auto lower_info = lowered_graph.getLowerInfo(op_seq_index); ++ auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend()); ++ for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED | ++ ir::Remove::UNDEFINED) ++ { ++ // If an OpSequence input/output tensor does not have a own tensor object, ++ // it must be using external tensors, so find the tensor from other tensor builders and ++ // set the tensor to this tensor builder if portable ++ if (!backend_ctx->tensor_builder->tensorAt(ind)) ++ { ++ auto tensor = tensor_builders.getITensor(ind); ++ assert(tensor); // The tensor must have been created in one of TensorBuilders ++ auto ptensor = std::dynamic_pointer_cast(tensor); ++ if (ptensor) ++ backend_ctx->tensor_builder->setMigrantTensor(ind, ptensor); ++ } ++ } ++ }); ++} ++ + exec::IExecutor * + ExecutorFactory::createLinearExecutor(std::unique_ptr lowered_graph, + const compiler::CompilerOptions &options, +@@ -265,6 +282,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr lowered_ + tensor_builder->prepare(); + } + ++ prepareExternalTensors(*lowered_graph, tensor_builders); ++ + ExecutionBuilder builder; + + // Generate kernels +@@ -367,6 +386,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( + 
tensor_builder->prepare(); + } + ++ prepareExternalTensors(*lowered_graph, tensor_builders); ++ + ExecutionBuilder builder; + + // Generate kernels +diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h +index 1e82b98..418e5a7 100644 +--- a/runtime/onert/core/src/compiler/ExecutorFactory.h ++++ b/runtime/onert/core/src/compiler/ExecutorFactory.h +@@ -22,6 +22,7 @@ + #include "backend/ITensor.h" + #include "exec/IExecutor.h" + #include "ir/LoweredGraph.h" ++#include "TensorBuilders.h" + + namespace onert + { +@@ -48,6 +49,8 @@ private: + static std::vector> + initializeModelIOTensors(ir::LoweredGraph &lowered_graph, + const ir::OperandIndexSequence &indices); ++ static void prepareExternalTensors(ir::LoweredGraph &lowered_graph, ++ TensorBuilders &tensor_builders); + static exec::IExecutor * + createLinearExecutor(std::unique_ptr lowered_graph, + const compiler::CompilerOptions &options, +diff --git a/runtime/onert/core/src/compiler/HEScheduler.h b/runtime/onert/core/src/compiler/HEScheduler.h +index f507539..d8ceca9 100644 +--- a/runtime/onert/core/src/compiler/HEScheduler.h ++++ b/runtime/onert/core/src/compiler/HEScheduler.h +@@ -51,16 +51,12 @@ public: + * @param[in] backend_resolver backend resolver + */ + HEScheduler(const backend::BackendContexts &backend_contexts, const CompilerOptions &options) +- : _backend_contexts{backend_contexts}, _is_supported{}, _backends_avail_time{}, _ops_eft{}, ++ : _is_supported{}, _backends_avail_time{}, _ops_eft{}, + _op_to_rank{std::make_shared>()}, + _is_profiling_mode{options.he_profiling_mode}, + _is_linear_exec{options.executor == "Linear"}, + _is_parallel_exec{options.executor == "Parallel"} + { +- // Workaround to avoid unused-private-field warning +- // TODO use _backend_contexts and remove workaround +- (void)_backend_contexts; +- + for (auto &entry : backend_contexts) + { + _all_backends.push_back(entry.first); +@@ -165,7 +161,6 @@ private: + // whether it should assign these backends to these nodes: + // * It stores false for unsupported nodes + // * During rank calculation with enabled profiling mode it stores true for supported nodes +- const backend::BackendContexts &_backend_contexts; + std::unordered_map> _is_supported; + // Finishing and starting time of each backend + std::unordered_map> _backends_avail_time; +@@ -175,8 +170,7 @@ private: + std::unique_ptr _backend_resolver; + std::unique_ptr _exec_time; + const ir::Graph *_graph{nullptr}; +- std::vector +- _all_backends; // TODO Remove this and use _backend_contexts instead ++ std::vector _all_backends; + const backend::Backend *_cpu_backend{nullptr}; // TODO Change this to controlflow_backend + bool _is_profiling_mode; + bool _is_linear_exec; +diff --git a/runtime/onert/core/src/compiler/OperationValidator.cc b/runtime/onert/core/src/compiler/OperationValidator.cc +index 5c545ae..fa5ee27 100644 +--- a/runtime/onert/core/src/compiler/OperationValidator.cc ++++ b/runtime/onert/core/src/compiler/OperationValidator.cc +@@ -41,6 +41,21 @@ OperationValidator::OperationValidator(const ir::Graph &graph) + { + } + ++void OperationValidator::checkUnaryOp(const ir::Operation &node) ++{ ++ const auto output_index{node.getOutputs().at(0)}; ++ const auto input_index{node.getInputs().at(0)}; ++ ++ // Check if I/O types match ++ OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); ++ ++ if (_ctx.at(output_index).info().isDynamic()) ++ return; ++ ++ // Check if I/O shapes match ++ 
OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); ++} ++ + void OperationValidator::operator()() + { + // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when +@@ -53,16 +68,7 @@ void OperationValidator::operator()() + [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); }); + } + +-void OperationValidator::visit(const ir::operation::Abs &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- +- const auto input_index{node.getInputs().at(0)}; +- +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Abs &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::AvgPool2D &node) + { +@@ -292,17 +298,7 @@ void OperationValidator::visit(const ir::operation::RNN &node) + num_units == _ctx.at(hidden_state_out_index).shape().dim(1)); + } + +-void OperationValidator::visit(const ir::operation::Round &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(ir::operation::Round::Input::INPUT)}; +- +- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); +- +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Round &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::SpaceToBatchND &node) + { +@@ -393,17 +389,7 @@ void OperationValidator::visit(const ir::operation::EmbeddingLookup &node) + } + } + +-void OperationValidator::visit(const ir::operation::Exp &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; +- +- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); +- +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Exp &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::ExpandDims &node) + { +@@ -419,17 +405,7 @@ void OperationValidator::visit(const ir::operation::ExpandDims &node) + OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1); + } + +-void OperationValidator::visit(const ir::operation::Floor &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- const auto input_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; +- +- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); +- +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Floor &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::HashtableLookup &node) + { +@@ -789,6 +765,25 @@ void OperationValidator::visit(const ir::operation::LSTM &node) + } + } + ++void OperationValidator::visit(const ir::operation::L2Normalization &node) ++{ ++ const auto ofm_index{node.getOutputs().at(0)}; ++ if (_ctx.at(ofm_index).info().isDynamic()) ++ return; ++ ++ const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)}; ++ ++ auto ifm_shape 
= _ctx.at(ifm_index).shape(); ++ auto ofm_shape = _ctx.at(ofm_index).shape(); ++ ++ OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank()); ++ ++ for (auto i = 0; i < ifm_shape.rank(); i++) ++ { ++ OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i)); ++ } ++} ++ + void OperationValidator::visit(const ir::operation::Unpack &node) + { + const auto num{node.param().num}; +@@ -904,35 +899,11 @@ void OperationValidator::visit(const ir::operation::Split &node) + OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0); + } + +-void OperationValidator::visit(const ir::operation::Cos &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} +- +-void OperationValidator::visit(const ir::operation::Sin &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; ++void OperationValidator::visit(const ir::operation::Cos &node) { checkUnaryOp(node); } + +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Sin &node) { checkUnaryOp(node); } + +-void OperationValidator::visit(const ir::operation::RSQRT &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::RSQRT &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::Shape &node) + { +@@ -972,35 +943,11 @@ void OperationValidator::visit(const ir::operation::While &node) + // TODO Add to validate with subgraphs + } + +-void OperationValidator::visit(const ir::operation::Neg &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; ++void OperationValidator::visit(const ir::operation::Neg &node) { checkUnaryOp(node); } + +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::Log &node) { checkUnaryOp(node); } + +-void OperationValidator::visit(const ir::operation::Log &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} +- +-void OperationValidator::visit(const ir::operation::LogicalNot &node) +-{ +- const auto output_index{node.getOutputs().at(0)}; +- if (_ctx.at(output_index).info().isDynamic()) +- return; +- +- const auto input_index{node.getInputs().at(0)}; +- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +-} ++void OperationValidator::visit(const ir::operation::LogicalNot &node) { checkUnaryOp(node); } + + void OperationValidator::visit(const ir::operation::SquaredDifference &node) + { +@@ -1118,5 +1065,25 @@ void OperationValidator::visit(const ir::operation::LogSoftmax &node) + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); + } ++ ++void OperationValidator::visit(const 
ir::operation::Quantize &node) ++{ ++ VERBOSE(Quantize) << "Configure Quantize operation" << std::endl; ++ ++ OP_REQUIRES(node.getInputs().size() == 1); ++ OP_REQUIRES(node.getOutputs().size() == 1); ++ ++ const auto input_index{node.getInputs().at(0)}; ++ const auto output_index{node.getOutputs().at(0)}; ++ ++ OP_REQUIRES(_ctx.at(input_index).typeInfo().type() == ir::DataType::FLOAT32); ++ ++ if (_ctx.at(output_index).info().isDynamic()) ++ return; ++ ++ OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM); ++ ++ OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); ++} + } // namespace compiler + } // namespace onert +diff --git a/runtime/onert/core/src/compiler/OperationValidator.h b/runtime/onert/core/src/compiler/OperationValidator.h +index 6ceafe8..55a4dd5 100644 +--- a/runtime/onert/core/src/compiler/OperationValidator.h ++++ b/runtime/onert/core/src/compiler/OperationValidator.h +@@ -70,6 +70,7 @@ public: + void visit(const ir::operation::DepthToSpace &node) override; + void visit(const ir::operation::Pack &node) override; + void visit(const ir::operation::LSTM &node) override; ++ void visit(const ir::operation::L2Normalization &node) override; + void visit(const ir::operation::Unpack &node) override; + void visit(const ir::operation::Pad &node) override; + void visit(const ir::operation::Min &node) override; +@@ -93,9 +94,10 @@ public: + void visit(const ir::operation::Range &node) override; + void visit(const ir::operation::MatrixBandPart &node) override; + void visit(const ir::operation::LogSoftmax &node) override; ++ void visit(const ir::operation::Quantize &node) override; + + private: +- void checkReduceOp(const ir::OperandIndex input_index, const ir::OperandIndex output_index); ++ void checkUnaryOp(const ir::Operation &node); + + private: + // TODO Remove _ctx field +diff --git a/runtime/onert/core/src/compiler/StaticShapeInference.cc b/runtime/onert/core/src/compiler/StaticShapeInference.cc +index 5a58f2e..66de599 100644 +--- a/runtime/onert/core/src/compiler/StaticShapeInference.cc ++++ b/runtime/onert/core/src/compiler/StaticShapeInference.cc +@@ -497,6 +497,11 @@ void StaticShapeInferer::visit(const ir::operation::Logistic &op) + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::Input::INPUT)); + } + ++void StaticShapeInferer::visit(const ir::operation::L2Normalization &op) ++{ ++ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::Input::INPUT)); ++} ++ + void StaticShapeInferer::visit(const ir::operation::MatrixBandPart &op) + { + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)); +diff --git a/runtime/onert/core/src/compiler/TensorBuilders.h b/runtime/onert/core/src/compiler/TensorBuilders.h +index 4bb7413..c0a1ebc 100644 +--- a/runtime/onert/core/src/compiler/TensorBuilders.h ++++ b/runtime/onert/core/src/compiler/TensorBuilders.h +@@ -23,6 +23,7 @@ + #include "backend/Backend.h" + #include "backend/controlflow/Config.h" + #include "backend/controlflow/TensorBuilder.h" ++#include "util/logging.h" + + namespace onert + { +@@ -66,6 +67,17 @@ public: + return _cf_tensor_builder; + } + ++ std::shared_ptr getITensor(ir::OperandIndex ind) ++ { ++ for (auto &tensor_builder : _tensor_builders) ++ { ++ auto tensor = tensor_builder->tensorAt(ind); ++ if (tensor) ++ return tensor; ++ } ++ return nullptr; ++ } ++ + private: + std::unordered_set> _tensor_builders; + std::shared_ptr _cf_tensor_builder; +diff --git 
a/runtime/onert/core/src/exec/DynamicShapeInference.cc b/runtime/onert/core/src/exec/DynamicShapeInference.cc +index 1b82029..28e92ba 100644 +--- a/runtime/onert/core/src/exec/DynamicShapeInference.cc ++++ b/runtime/onert/core/src/exec/DynamicShapeInference.cc +@@ -442,6 +442,11 @@ void DynamicShapeInferer::visit(const ir::operation::Logistic &op) + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::INPUT)); + } + ++void DynamicShapeInferer::visit(const ir::operation::L2Normalization &op) ++{ ++ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::INPUT)); ++} ++ + void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op) + { + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT)); +diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc +index a7409b9..864ccb3 100644 +--- a/runtime/onert/core/src/exec/ExecutorBase.cc ++++ b/runtime/onert/core/src/exec/ExecutorBase.cc +@@ -46,7 +46,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_graph, + { + auto tensor_registry = tensor_builder->tensorRegistry(); + assert(tensor_registry); +- tensor = tensor_registry->getManagedITensor(ind); ++ tensor = tensor_registry->getNativeITensor(ind); + if (tensor != nullptr) + { + if (tensor_builder->supportDynamicTensor()) +@@ -71,7 +71,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_graph, + { + auto tensor_registry = tensor_builder->tensorRegistry(); + assert(tensor_registry); +- tensor = tensor_registry->getManagedITensor(ind); ++ tensor = tensor_registry->getNativeITensor(ind); + if (tensor != nullptr) + { + if (tensor_builder->supportDynamicTensor()) +diff --git a/runtime/onert/core/src/interp/operations/Pad.cc b/runtime/onert/core/src/interp/operations/Pad.cc +index d2e3627..c8dce69 100644 +--- a/runtime/onert/core/src/interp/operations/Pad.cc ++++ b/runtime/onert/core/src/interp/operations/Pad.cc +@@ -69,8 +69,8 @@ void invoke(const ITensor *input_tensor, const ITensor *pad_tensor, const ITenso + const int32_t *pad_ptr = reinterpret_cast(pad_buffer); + float *output_ptr = reinterpret_cast(output_buffer); + +- nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, output_ptr, +- nullptr); ++ nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, ++ output_ptr, nullptr); + } + + void invokePad(const ExecEnv *env, const ir::Operation &node) +diff --git a/runtime/onert/core/src/ir/LoweredGraph.cc b/runtime/onert/core/src/ir/LoweredGraph.cc +index 6e93a23..f138089 100644 +--- a/runtime/onert/core/src/ir/LoweredGraph.cc ++++ b/runtime/onert/core/src/ir/LoweredGraph.cc +@@ -122,9 +122,6 @@ LoweredGraph::LoweredGraph(const Graph &graph, const compiler::CompilerOptions & + + pass::PermutationInsertionPass pi_pass(*this); + pi_pass.run(); +- // Implemented code no longer works. +- // pass::PermutationEliminationPass pe_pass(*this); +- // pe_pass.run(); + + _op_seqs.dump("merged and sorted operations with permutation", _graph.operations()); + } +diff --git a/runtime/onert/core/src/ir/operation/Quantize.cc b/runtime/onert/core/src/ir/operation/Quantize.cc +new file mode 100644 +index 0000000..0e3d5b6 +--- /dev/null ++++ b/runtime/onert/core/src/ir/operation/Quantize.cc +@@ -0,0 +1,37 @@ ++/* ++ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "ir/operation/Quantize.h" ++ ++#include "ir/OperationVisitor.h" ++ ++namespace onert ++{ ++namespace ir ++{ ++namespace operation ++{ ++ ++void Quantize::accept(OperationVisitor &v) const { v.visit(*this); } ++ ++Quantize::Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs) ++ : Operation{OperandConstraint::createExact(2u), inputs, outputs} ++{ ++} ++ ++} // namespace operation ++} // namespace ir ++} // namespace onert +diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc +deleted file mode 100644 +index 9e0291e..0000000 +--- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc ++++ /dev/null +@@ -1,195 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. 
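The OperationValidator hunk earlier in this patch only admits FLOAT32 inputs and QUANT_UINT8_ASYMM outputs for the new Quantize operation; the kernel itself is not part of these hunks. As a rough sketch of the affine (asymmetric) scheme that output type implies, with scale and zero_point standing in for whatever the output operand's TypeInfo carries (illustration only, not the onert cpu backend kernel):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustrative only: float32 -> QUANT8_ASYMM conversion with a per-tensor
    // scale and zero point, clamped to the uint8 range.
    inline uint8_t quantizeAsymm(float x, float scale, int32_t zero_point)
    {
      const int32_t q = static_cast<int32_t>(std::round(x / scale)) + zero_point;
      return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
    }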
+- */ +- +-#include "PermutationEliminationPass.h" +- +-#include "ir/Operand.h" +-#include "ir/operand/LowerInfo.h" +-#include "ir/Graph.h" +-#include "backend/IConfig.h" +-#include "util/logging.h" +- +-namespace onert +-{ +-namespace ir +-{ +-namespace pass +-{ +-void PermutationEliminationPass::callback(const OperandIndex &inp_index, Operand &object) +-{ +- if (_graph.getInputs().contains(inp_index)) +- { +- eliminateInput(inp_index, object); +- } +- else if (_graph.getOutputs().contains(inp_index)) +- { +- eliminateOutput(inp_index, object); +- } +-} +- +-void PermutationEliminationPass::eliminateInput(const OperandIndex &inp_index, Operand &object) +-{ +- auto &model_inputs = _graph.getInputs(); +- +- // get uses of the model's given input +- auto uses = object.getUses(); +- +- // input must be used just by permutation +- if (uses.size() != 1) +- { +- return; +- } +- +- for (auto input_use : uses) +- { +- auto &perm_operation = _graph.operations().at(input_use); +- auto perm_inputs = perm_operation.getInputs(); +- +- auto perm_outputs = perm_operation.getOutputs(); +- +- if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, true)) +- { +- return; +- } +- +- assert(perm_inputs.at(0) == inp_index); +- +- VERBOSE(PermutationEliminationPass::EliminateInput) << "remove NHWC_TO_NCHW permutation\n"; +- +- // set model's new input, which was output of permutation +- model_inputs.replace(inp_index, perm_outputs.at(0)); +- +- // remove model's input, which is also input of permutation +- _graph.removeOperand(inp_index); +- +- // remove permutation operation +- assert(_lowered_graph.op_seqs().containsOperation(input_use)); +- auto op_seq_idx = _lowered_graph.op_seqs().getOperation(input_use); +- _lowered_graph.op_seqs().remove(op_seq_idx); +- _graph.operations().remove(input_use); +- +- VERBOSE(PermutationEliminationPass::EliminateInput) +- << inp_index.value() << " is model's input and is removed. 
New input is " +- << perm_outputs.at(0).value() << "\n" +- << input_use.value() << " is removed permutation operation\n"; +- } +-} +- +-void PermutationEliminationPass::eliminateOutput(const OperandIndex &out_index, Operand &object) +-{ +- auto &model_outputs = _graph.getOutputs(); +- +- // get defs of the model's given output +- auto defs = object.getDef(); +- +- // output must use just permutation +- if (defs.size() != 1) +- { +- return; +- } +- +- for (auto output_def : defs) +- { +- auto &perm_operation = _graph.operations().at(output_def); +- auto perm_outputs = perm_operation.getOutputs(); +- +- auto perm_inputs = perm_operation.getInputs(); +- if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, false)) +- { +- return; +- } +- +- assert(perm_outputs.at(0) == out_index); +- +- VERBOSE(PermutationEliminationPass::EliminateOutput) << "remove NCHW_TO_NHWC permutation\n"; +- +- // Update operations' output that is used by permute operand +- for (auto perm_input_index : perm_inputs) +- { +- auto &perm_input_operand = _graph.operands().at(perm_input_index); +- perm_input_operand.removeUse(output_def); +- } +- +- // set model's new output, which was input of permutation +- model_outputs.replace(out_index, perm_inputs.at(0)); +- +- // remove model's output, which is also output of permutation +- _graph.removeOperand(out_index); +- +- // remove permutation operation +- assert(_lowered_graph.op_seqs().containsOperation(output_def)); +- auto op_seq_idx = _lowered_graph.op_seqs().getOperation(output_def); +- _lowered_graph.op_seqs().remove(op_seq_idx); +- _graph.operations().remove(output_def); +- +- VERBOSE(PermutationEliminationPass::EliminateOutput) +- << out_index.value() << " is model's output and is removed. New output is " +- << perm_inputs.at(0).value() << "\n" +- << output_def.value() << " is removed permutation operation\n"; +- } +-} +- +-bool PermutationEliminationPass::isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes, +- const OperandIndexSequence &out_indexes, +- bool is_for_model_input) +-{ +- auto input_def_factors = _lowered_graph.getLowerInfo(inp_indexes.at(0))->def_factors(); +- auto output_def_factors = _lowered_graph.getLowerInfo(out_indexes.at(0))->def_factors(); +- +- auto input_layout = input_def_factors.getOnlyElement().layout(); +- auto output_layout = output_def_factors.getOnlyElement().layout(); +- +- if (input_def_factors.size() != 1 || output_def_factors.size() != 1) +- { +- return false; +- } +- +- // all operands' factor must be the same +- for (auto index : inp_indexes) +- { +- auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors(); +- if (op_factor_set.size() != 1 || +- input_layout != _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout()) +- { +- return false; +- } +- } +- // all operands' factor must be the same +- for (auto index : out_indexes) +- { +- auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors(); +- if (op_factor_set.size() != 1 || +- output_layout != +- _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout()) +- { +- return false; +- } +- } +- +- if (is_for_model_input) +- { +- // check if this is NHWC_TO_NCHW permutation: must have single input, which is model's input +- return (inp_indexes.size() == 1 && input_layout == Layout::NHWC && +- output_layout == Layout::NCHW); +- } +- +- // check if this is NCHW_TO_NHWC permutation: must have single output, which is model's output +- return (out_indexes.size() == 1 && input_layout == Layout::NCHW && 
output_layout == Layout::NHWC); +-} +- +-} // namespace pass +-} // namespace ir +-} // namespace onert +diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h +deleted file mode 100644 +index 1c84300..0000000 +--- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h ++++ /dev/null +@@ -1,86 +0,0 @@ +-/* +- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-#ifndef __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ +-#define __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ +- +-#include "LoweredOperandPass.h" +-#include "ir/Operand.h" +-#include "ir/OperandIndexSequence.h" +- +-namespace onert +-{ +-namespace ir +-{ +-namespace pass +-{ +- +-class PermutationEliminationPass : public LoweredOperandPass +-{ +-public: +- using LoweredOperandPass::LoweredOperandPass; +- +-public: +- std::string id() override { return "PermutationEliminationPass"; } +- +- void callback(const OperandIndex &index, Operand &object) override; +- +-private: +- /** +- * @brief Remove Permute operation that permutates input +- * +- * Note: This function aslo removes model's input and +- * sets output of permutation as model's new input +- * +- * @param inp_index is the target operand index for the elimination +- * @param object is the target operand object for the elimination +- * +- * @return +- */ +- void eliminateInput(const OperandIndex &inp_index, Operand &object); +- +- /** +- * @brief Remove Permute operation that permutates output of a model +- * +- * Note: This function aslo removes model's output and +- * sets input of permutation as model's new output +- * +- * @param out_index is the target operand index for the elimination +- * @param object is the target operand object for the elimination +- * +- * @return +- */ +- void eliminateOutput(const OperandIndex &out_index, Operand &object); +- +- /** +- * @brief Determine if passed operands are permute layer's input and output, that must be +- * eliminated +- * +- * @param inp_index indexes of the input operand to operation +- * @param out_index indexes of the output operand to operation +- * @param is_for_model_input checking for model's input or output +- * +- * @return if it is permutation layer +- */ +- bool isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes, +- const OperandIndexSequence &out_indexes, bool is_for_model_input); +-}; +- +-} // namespace pass +-} // namespace ir +-} // namespace onert +- +-#endif // __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ +diff --git a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc +index 7c3da52..75efdd8 100644 +--- a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc ++++ b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc +@@ -62,27 +62,26 @@ void PermutationInsertionPass::callback(const OperandIndex &index, Operand 
&obje + auto insert_set = operand_li->use_factors() - operand_li->def_factors(); + auto def_factor = operand_li->def_factors().getOnlyElement(); + +- auto compatible_backends = [](auto /* backend1 */, auto /* backend2 */) { +- // TODO If other issues for Permute elimination are resolved, enable this +- return false; +- /* ++ auto compatible_backends = [](auto backend1, auto backend2) { + // TODO This is a workaround for not inserting Permute between cpu and controlflow. + // To be general, we need another way of checking they are compatible. + const auto cf = backend::controlflow::Config::ID; + const auto cpu = "cpu"; + const auto id1 = backend1->config()->id(); + const auto id2 = backend2->config()->id(); +- return (id1 == cpu && id2 == cf) // Allows no-Permute for Model inputs +- || (id1 == cf && id2 == cpu); // Allows no-Permute for Model outputs +- */ ++ // NOTE This is to skip Permute insertion for model inputs(controlflow -> cpu), but not ++ // outputs. This function currently assumes that backend1 is Def and backend2 is Use. However ++ // it is going to be fixed soon. ++ // TODO make both ways work ++ return (id1 == cpu && id2 == cf); + }; + + for (auto factor : insert_set) + { ++ // Check exceptional cases that Permute ops are not inserted + if (factor.layout() == def_factor.layout() && + compatible_backends(factor.backend(), def_factor.backend())) + { +- // For this factor we can just reuse existing operand - Permute is not added. + VERBOSE(PermutationInsertionPass) << "Permutation Insertion is skipped for operand " + << index << " / as the tensor is compatible with backend " + << factor.backend()->config()->id() << std::endl; +diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h +index f5687ad..f763346 100644 +--- a/runtime/onert/frontend/base_loader/include/base_loader.h ++++ b/runtime/onert/frontend/base_loader/include/base_loader.h +@@ -171,6 +171,8 @@ protected: + void loadBroadcastTo(const Operator *op, ir::Graph &subg); + void loadFusedBatchNorm(const Operator *op, ir::Graph &subg); + void loadLogSoftmax(const Operator *op, ir::Graph &subg); ++ void loadQuantize(const Operator *op, ir::Graph &subg); ++ void loadSpaceToDepth(const Operator *op, ir::Graph &subg); + + protected: + // Base address for mapped region for loading (if needed) +@@ -1123,6 +1125,22 @@ void BaseLoader::loadBroadcastTo(const Operator *o + std::unique_ptr new_op(new ir::operation::BroadcastTo(inputs, outputs)); + subg.addOperation(std::move(new_op)); + } ++template ++void BaseLoader::loadSpaceToDepth(const Operator *op, ir::Graph &subg) ++{ ++ ir::OperandIndexSequence inputs; ++ ir::OperandIndexSequence outputs; ++ ir::operation::SpaceToDepth::Param param; ++ ++ const auto *options = op->builtin_options_as_SpaceToDepthOptions(); ++ ++ param.block_size = options->block_size(); ++ ++ loadOperationIO(op, inputs, outputs); ++ ++ std::unique_ptr new_op(new ir::operation::SpaceToDepth(inputs, outputs, param)); ++ subg.addOperation(std::move(new_op)); ++} + + template + void BaseLoader::loadCustom(const Operator *op, ir::Graph &subg) +@@ -1743,6 +1761,18 @@ void BaseLoader::loadLogSoftmax(const Operator *op + } + + template ++void BaseLoader::loadQuantize(const Operator *op, ir::Graph &subg) ++{ ++ ir::OperandIndexSequence inputs; ++ ir::OperandIndexSequence outputs; ++ ++ loadOperationIO(op, inputs, outputs); ++ ++ std::unique_ptr new_op(new ir::operation::Quantize(inputs, outputs)); ++ subg.addOperation(std::move(new_op)); ++} ++ 
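loadSpaceToDepth above only records block_size from the flatbuffer options; the shape contract it relies on is that, for an NHWC tensor, the spatial dimensions shrink by block_size while the channel dimension grows by block_size squared. A small illustrative helper (not part of this patch) that captures that rule:

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Output shape of SPACE_TO_DEPTH for an NHWC input; H and W must be
    // divisible by block_size.
    inline std::array<int32_t, 4> spaceToDepthShape(const std::array<int32_t, 4> &nhwc,
                                                    int32_t block_size)
    {
      assert(block_size >= 1);
      assert(nhwc[1] % block_size == 0 && nhwc[2] % block_size == 0);
      return {nhwc[0], nhwc[1] / block_size, nhwc[2] / block_size,
              nhwc[3] * block_size * block_size};
    }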
++template + void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg) + { + const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); +@@ -1959,6 +1989,12 @@ void BaseLoader::loadOperation(const Operator *op, + case BuiltinOperator::BuiltinOperator_LOG_SOFTMAX: + loadLogSoftmax(op, subg); + return; ++ case BuiltinOperator::BuiltinOperator_QUANTIZE: ++ loadQuantize(op, subg); ++ return; ++ case BuiltinOperator::BuiltinOperator_SPACE_TO_DEPTH: ++ loadSpaceToDepth(op, subg); ++ return; + default: + throw std::runtime_error( + std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); +diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc +index 94791f8..00ffcb6 100644 +--- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc ++++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc +@@ -106,6 +106,33 @@ getReduceGenerator(const onert::ir::operation::Reduce::ReduceType reduce_type) + }; + } + ++template ++Operation *CreateSimpleUnaryOp(const OperationFactory::Param &init_param, Operands &) ++{ ++ assert(init_param.input_count == 1 && init_param.output_count == 1); ++ ++ OperandIndexSequence outputs{init_param.outputs[0]}; ++ ++ // Each input should be interpreted as follows: ++ // ++ // 0 -> Input Tensor Index ++ OperandIndexSequence inputs{init_param.inputs[0]}; ++ ++ return new T{inputs, outputs}; ++} ++ ++// A generator function for binary ops with no params ++template ++Operation *createSimpleBinaryOp(const OperationFactory::Param &init_param, Operands &) ++{ ++ assert(init_param.input_count == 2 && init_param.output_count == 1); ++ ++ OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; ++ OperandIndexSequence outputs{init_param.outputs[0]}; ++ ++ return new T{inputs, outputs}; ++} ++ + } // namespace + + OperationFactory &OperationFactory::get() +@@ -116,20 +143,10 @@ OperationFactory &OperationFactory::get() + + OperationFactory::OperationFactory() + { +- _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = [](const OperationFactory::Param &init_param, +- Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- // 1 -> Block size Index +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::BatchToSpaceND{inputs, outputs}; +- }; ++ // Each input should be interpreted as follows: ++ // 0 -> Input Tensor Index ++ // 1 -> Block size Index ++ _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = createSimpleBinaryOp; + + _map[ANEURALNETWORKS_DEPTHWISE_CONV_2D] = [](const OperationFactory::Param &init_param, + Operands &operands) { +@@ -724,44 +741,11 @@ OperationFactory::OperationFactory() + return new operation::Squeeze{inputs, outputs, param}; + }; + +- _map[ANEURALNETWORKS_TANH] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Tanh{inputs, outputs}; +- }; +- +- _map[ANEURALNETWORKS_LOG] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && 
init_param.output_count == 1); ++ _map[ANEURALNETWORKS_TANH] = CreateSimpleUnaryOp; + +- OperandIndexSequence outputs{init_param.outputs[0]}; ++ _map[ANEURALNETWORKS_LOG] = CreateSimpleUnaryOp; + +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Log{inputs, outputs}; +- }; +- +- _map[ANEURALNETWORKS_LOGISTIC] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Logistic{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_LOGISTIC] = CreateSimpleUnaryOp; + + _map[ANEURALNETWORKS_DIV] = [](const OperationFactory::Param &init_param, Operands &operands) { + assert(init_param.input_count == 3 && init_param.output_count == 1); +@@ -784,36 +768,16 @@ OperationFactory::OperationFactory() + return new operation::Div{inputs, outputs, param}; + }; + +- _map[ANEURALNETWORKS_EXP] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Exp{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_EXP] = CreateSimpleUnaryOp; + + // ANEURALNETWORKS_EXP_EX is deprecated + // TODO Remove ANEURALNETWORKS_EXP_EX + _map[ANEURALNETWORKS_EXP_EX] = _map[ANEURALNETWORKS_EXP]; + +- _map[ANEURALNETWORKS_EXPAND_DIMS] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- // 1 -> Axis Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::ExpandDims{inputs, outputs}; +- }; ++ // Each input should be interpreted as follows: ++ // 0 -> Input Tensor Index ++ // 1 -> Axis Tensor Index ++ _map[ANEURALNETWORKS_EXPAND_DIMS] = createSimpleBinaryOp; + + _map[ANEURALNETWORKS_GREATER] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 2 && init_param.output_count == 1); +@@ -982,19 +946,7 @@ OperationFactory::OperationFactory() + return new operation::Comparison{inputs, outputs, param}; + }; + +- _map[ANEURALNETWORKS_LOGICAL_AND] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> input0 Tensor Index +- // 1 -> input1 Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::LogicalAnd{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_LOGICAL_AND] = createSimpleBinaryOp; + + // ANEURALNETWORKS_LOGICAL_AND_EX is deprecated + // TODO Remove ANEURALNETWORKS_LOGICAL_AND_EX +@@ -1018,18 +970,7 @@ OperationFactory::OperationFactory() + return new operation::LogicalAnd{inputs, outputs}; + }; + +- _map[ANEURALNETWORKS_RSQRT] = 
[](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::RSQRT{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_RSQRT] = CreateSimpleUnaryOp; + + _map[ANEURALNETWORKS_SELECT] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 3 && init_param.output_count == 1); +@@ -1065,18 +1006,7 @@ OperationFactory::OperationFactory() + // TODO Remove ANEURALNETWORKS_RSQRT_EX + _map[ANEURALNETWORKS_RSQRT_EX] = _map[ANEURALNETWORKS_RSQRT]; + +- _map[ANEURALNETWORKS_RELU] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::ReLU{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_RELU] = CreateSimpleUnaryOp; + + _map[ANEURALNETWORKS_RESIZE_BILINEAR] = [](const OperationFactory::Param &init_param, + Operands &operands) { +@@ -1098,31 +1028,9 @@ OperationFactory::OperationFactory() + return new operation::ResizeBilinear{inputs, outputs, param}; + }; + +- _map[ANEURALNETWORKS_RELU1] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; ++ _map[ANEURALNETWORKS_RELU1] = CreateSimpleUnaryOp; + +- return new operation::ReLU1{inputs, outputs}; +- }; +- +- _map[ANEURALNETWORKS_RELU6] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::ReLU6{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_RELU6] = CreateSimpleUnaryOp; + + _map[ANEURALNETWORKS_REVERSE_EX] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 2 && init_param.output_count == 1); +@@ -1438,18 +1346,7 @@ OperationFactory::OperationFactory() + return new operation::LogicalOr{inputs, outputs}; + }; + +- _map[ANEURALNETWORKS_LOGICAL_NOT] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::LogicalNot{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_LOGICAL_NOT] = CreateSimpleUnaryOp; + + // ANEURALNETWORKS_LOGICAL_NOT_EX is deprecated + // TODO Remove ANEURALNETWORKS_LOGICAL_NOT_EX +@@ -1649,35 +1546,13 @@ OperationFactory::OperationFactory() + // TODO Remove ANEURALNETWORKS_GATHER_EX + _map[ANEURALNETWORKS_GATHER_EX] = _map[ANEURALNETWORKS_GATHER]; + +- 
_map[ANEURALNETWORKS_NEG] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Neg{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_NEG] = CreateSimpleUnaryOp; + + // ANEURALNETWORKS_NEG_EX is deprecated + // TODO Remove ANEURALNETWORKS_NEG_EX + _map[ANEURALNETWORKS_NEG_EX] = _map[ANEURALNETWORKS_NEG]; + +- _map[ANEURALNETWORKS_ABS] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Abs{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_ABS] = CreateSimpleUnaryOp; + + // ANEURALNETWORKS_ABS_EX is deprecated + // TODO Remove ANEURALNETWORKS_ABS_EX +@@ -1704,18 +1579,7 @@ OperationFactory::OperationFactory() + // TODO Remove ANEURALNETWORKS_ARGMAX_EX + _map[ANEURALNETWORKS_ARGMAX_EX] = _map[ANEURALNETWORKS_ARGMAX]; + +- _map[ANEURALNETWORKS_DEQUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 1 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- OperandIndexSequence inputs{init_param.inputs[0]}; +- +- return new operation::Dequantize{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_DEQUANTIZE] = CreateSimpleUnaryOp; + + _map[ANEURALNETWORKS_MEAN] = [](const OperationFactory::Param &init_param, Operands &operands) { + assert(init_param.input_count == 3 && init_param.output_count == 1); +@@ -1841,31 +1705,24 @@ OperationFactory::OperationFactory() + }; + + _map[ANEURALNETWORKS_PAD] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count >= 1); ++ assert(init_param.input_count >= 2 && init_param.input_count <= 3 && ++ init_param.output_count >= 1); + + OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; ++ if (init_param.input_count == 3) ++ { ++ inputs.append(OperandIndex{init_param.inputs[2]}); ++ } + OperandIndexSequence outputs{init_param.outputs[0]}; + + return new operation::Pad{inputs, outputs}; + }; + +- _map[ANEURALNETWORKS_MINIMUM] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); ++ _map[ANEURALNETWORKS_PAD_V2] = _map[ANEURALNETWORKS_PAD]; + +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- OperandIndexSequence outputs{init_param.outputs[0]}; ++ _map[ANEURALNETWORKS_MINIMUM] = createSimpleBinaryOp; + +- return new operation::Min{inputs, outputs}; +- }; +- +- _map[ANEURALNETWORKS_MAXIMUM] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- return new operation::Max{inputs, outputs}; +- }; ++ _map[ANEURALNETWORKS_MAXIMUM] = createSimpleBinaryOp; + + 
_map[ANEURALNETWORKS_ONE_HOT_EX] = [](const OperationFactory::Param &init_param, + Operands &operands) { +@@ -1948,34 +1805,15 @@ OperationFactory::OperationFactory() + return new operation::Range{inputs, outputs}; + }; + +- _map[ANEURALNETWORKS_POW] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); ++ // Each input should be interpreted as follows: ++ // 0 -> LHS Tensor Index ++ // 1 -> RHS Tensor Index ++ _map[ANEURALNETWORKS_POW] = createSimpleBinaryOp; + +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> LHS Tensor Index +- // 1 -> RHS Tensor Index +- +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::Pow{inputs, outputs}; +- }; +- +- _map[ANEURALNETWORKS_FILL_EX] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- // Each input should be interpreted as follows: +- // +- // 0 -> A tensor, specifying the input. +- // 1 -> A 1-D tensor, specifying the value +- +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- return new operation::Fill{inputs, outputs}; +- }; ++ // Each input should be interpreted as follows: ++ // 0 -> A tensor, specifying the input. ++ // 1 -> A 1-D tensor, specifying the value ++ _map[ANEURALNETWORKS_FILL_EX] = createSimpleBinaryOp; + + _map[ANEURALNETWORKS_ZEROS_LIKE_EX] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 1 && init_param.output_count == 1); +@@ -1989,20 +1827,10 @@ OperationFactory::OperationFactory() + return new operation::ZerosLike{inputs, outputs}; + }; + +- _map[ANEURALNETWORKS_TILE] = [](const OperationFactory::Param &init_param, Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- // 1 -> Multiple Tensor Index +- +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::Tile{inputs, outputs}; +- }; ++ // Each input should be interpreted as follows: ++ // 0 -> Input Tensor Index ++ // 1 -> Multiple Tensor Index ++ _map[ANEURALNETWORKS_TILE] = createSimpleBinaryOp; + + _map[ANEURALNETWORKS_MATRIX_BAND_PART_EX] = [](const OperationFactory::Param &init_param, + Operands &) { +@@ -2064,21 +1892,9 @@ OperationFactory::OperationFactory() + return new operation::Einsum{inputs, outputs, param}; + }; + +- _map[ANEURALNETWORKS_BROADCAST_TO_EX] = [](const OperationFactory::Param &init_param, +- Operands &) { +- assert(init_param.input_count == 2 && init_param.output_count == 1); +- +- OperandIndexSequence outputs{init_param.outputs[0]}; +- +- // Each input should be interpreted as follows: +- // +- // 0 -> Input Tensor Index +- // 1 -> int32, int64, An 1-D int tensor Index +- +- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; +- +- return new operation::BroadcastTo{inputs, outputs}; +- }; ++ // 0 -> Input Tensor Index ++ // 1 -> int32, int64, An 1-D int tensor Index ++ _map[ANEURALNETWORKS_BROADCAST_TO_EX] = createSimpleBinaryOp; + + _map[ANEURALNETWORKS_FUSED_BATCH_NORM_V3_EX] = [](const OperationFactory::Param &init_param, + Operands &operands) { +@@ -2133,6 +1949,15 @@ 
OperationFactory::OperationFactory() + + return new operation::LogSoftmax{inputs, outputs, param}; + }; ++ ++ _map[ANEURALNETWORKS_QUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) { ++ assert(init_param.input_count == 1 && init_param.output_count == 1); ++ ++ OperandIndexSequence inputs{init_param.inputs[0]}; ++ OperandIndexSequence outputs{init_param.outputs[0]}; ++ ++ return new operation::Quantize{inputs, outputs}; ++ }; + } + + Operation *OperationFactory::create(ANeuralNetworksOperationType type, +diff --git a/runtime/onert/test/core/exec/ExecInstance.cc b/runtime/onert/test/core/exec/ExecInstance.cc +index cc04347..0fcf372 100644 +--- a/runtime/onert/test/core/exec/ExecInstance.cc ++++ b/runtime/onert/test/core/exec/ExecInstance.cc +@@ -73,9 +73,8 @@ public: + // Compile + auto subgs = std::make_shared(); + subgs->push(onert::ir::SubgraphIndex{0}, graph); +- auto compiler = new onert::compiler::Compiler{subgs}; +- executors = compiler->compile(); +- delete compiler; ++ onert::compiler::Compiler compiler{subgs}; ++ executors = compiler.compile(); + } + + public: +@@ -98,19 +97,17 @@ TEST(ExecInstance, simple) + float output_buffer[4] = {}; + const float output_expected[4] = {5, -2, 0, -1}; + +- auto execution = new onert::exec::Execution(executors); ++ onert::exec::Execution execution{executors}; + +- execution->setInput(input1, reinterpret_cast(input1_buffer), 16); +- execution->setInput(input2, reinterpret_cast(input2_buffer), 16); +- execution->setOutput(output, reinterpret_cast(output_buffer), 16); +- execution->execute(); ++ execution.setInput(input1, reinterpret_cast(input1_buffer), 16); ++ execution.setInput(input2, reinterpret_cast(input2_buffer), 16); ++ execution.setOutput(output, reinterpret_cast(output_buffer), 16); ++ execution.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(output_buffer[i], output_expected[i]); + } +- +- delete execution; + } + + TEST(ExecInstance, twoCompile) +@@ -118,7 +115,7 @@ TEST(ExecInstance, twoCompile) + auto mockup = CompiledMockUpModel(); + auto graph = mockup.graph; + auto executors1 = mockup.executors; +- auto execution1 = new onert::exec::Execution(executors1); ++ onert::exec::Execution execution1{executors1}; + + auto input1 = IOIndex{0}; + auto input2 = IOIndex{1}; +@@ -129,38 +126,34 @@ TEST(ExecInstance, twoCompile) + float exe1_output_buffer[4] = {}; + const float exe1_output_expected[4] = {5, -2, 0, -1}; + +- execution1->setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); +- execution1->setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); +- execution1->setOutput(output, reinterpret_cast(exe1_output_buffer), 16); ++ execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); ++ execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); ++ execution1.setOutput(output, reinterpret_cast(exe1_output_buffer), 16); + + // Make new executor: compile again + auto subgs = std::make_shared(); + subgs->push(onert::ir::SubgraphIndex{0}, graph); +- auto compiler = new onert::compiler::Compiler{subgs}; +- std::shared_ptr executors2 = compiler->compile(); +- auto execution2 = new onert::exec::Execution(executors2); ++ onert::compiler::Compiler compiler{subgs}; ++ std::shared_ptr executors2 = compiler.compile(); ++ onert::exec::Execution execution2{executors2}; + + const float exe2_input1_buffer[4] = {2, 1, -2, 0}; + const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; + float exe2_output_buffer[4] = {}; + const float exe2_output_expected[4] = {2, 5, -2, 7}; + +- 
execution2->setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); +- execution2->setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); +- execution2->setOutput(output, reinterpret_cast(exe2_output_buffer), 16); ++ execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); ++ execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); ++ execution2.setOutput(output, reinterpret_cast(exe2_output_buffer), 16); + +- execution1->execute(); +- execution2->execute(); ++ execution1.execute(); ++ execution2.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); + EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); + } +- +- delete compiler; +- delete execution1; +- delete execution2; + } + + // Support two initialized execution instance then ordered execution +@@ -178,32 +171,29 @@ TEST(ExecInstance, twoExecution) + const float exe1_output_expected[4] = {5, -2, 0, -1}; + const float exe2_output_expected[4] = {2, 5, -2, 7}; + +- auto execution1 = new onert::exec::Execution(executors); +- execution1->setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); +- execution1->setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); +- execution1->setOutput(output1, reinterpret_cast(exe1_output_buffer), 16); ++ onert::exec::Execution execution1{executors}; ++ execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); ++ execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); ++ execution1.setOutput(output1, reinterpret_cast(exe1_output_buffer), 16); + + const float exe2_input1_buffer[4] = {2, 1, -2, 0}; + const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; + float exe2_output_buffer[4] = {}; + + // Make new execution +- auto execution2 = new onert::exec::Execution(executors); +- execution2->setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); +- execution2->setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); +- execution2->setOutput(output1, reinterpret_cast(exe2_output_buffer), 16); ++ onert::exec::Execution execution2{executors}; ++ execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); ++ execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); ++ execution2.setOutput(output1, reinterpret_cast(exe2_output_buffer), 16); + +- execution1->execute(); +- execution2->execute(); ++ execution1.execute(); ++ execution2.execute(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); + EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); + } +- +- delete execution1; +- delete execution2; + } + + class Inference +@@ -222,14 +212,12 @@ public: + auto input2 = IOIndex{1}; + auto output1 = IOIndex{0}; + +- auto execution = new onert::exec::Execution(_executors); +- execution->setInput(input1, reinterpret_cast(_input1), 16); +- execution->setInput(input2, reinterpret_cast(_input2), 16); +- execution->setOutput(output1, reinterpret_cast(_output), 16); ++ onert::exec::Execution execution{_executors}; ++ execution.setInput(input1, reinterpret_cast(_input1), 16); ++ execution.setInput(input2, reinterpret_cast(_input2), 16); ++ execution.setOutput(output1, reinterpret_cast(_output), 16); + +- execution->execute(); +- +- delete execution; ++ execution.execute(); + } + + private: +@@ -288,20 +276,18 @@ TEST(ExecInstance, async) + float output_buffer[4] = {}; + const float output_expected[4] = {5, -2, 0, -1}; + +- auto execution = new onert::exec::Execution(executors); ++ onert::exec::Execution 
execution{executors}; + +- execution->setInput(input1, reinterpret_cast(input1_buffer), 16); +- execution->setInput(input2, reinterpret_cast(input2_buffer), 16); +- execution->setOutput(output, reinterpret_cast(output_buffer), 16); +- execution->startExecute(); +- execution->waitFinish(); ++ execution.setInput(input1, reinterpret_cast(input1_buffer), 16); ++ execution.setInput(input2, reinterpret_cast(input2_buffer), 16); ++ execution.setOutput(output, reinterpret_cast(output_buffer), 16); ++ execution.startExecute(); ++ execution.waitFinish(); + + for (auto i = 0; i < 4; i++) + { + EXPECT_EQ(output_buffer[i], output_expected[i]); + } +- +- delete execution; + } + + } // namespace +diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl +index e50b941..005f61c 100644 +--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl ++++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl +@@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8 + GeneratedTests.cast_float16_to_quant8_overflow + GeneratedTests.cast_float32_to_float16 + GeneratedTests.cast_float32_to_float16_relaxed ++GeneratedTests.cast_float32_to_int32_nnfw + GeneratedTests.cast_int32_to_float16 +-GeneratedTests.cast_int32_to_quant8_overflow + GeneratedTests.cast_quant8_to_float16 + GeneratedTests.concat_dynamic_nnfw + GeneratedTests.conv_dynamic_nnfw +@@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7 + GeneratedTests.gather_float16_8 + GeneratedTests.greater_dynamic_float_nnfw + GeneratedTests.greater_equal_dynamic_float_nnfw ++GeneratedTests.l2_normalization_quant8_nnfw + GeneratedTests.less_dynamic_float_nnfw + GeneratedTests.less_equal_dynamic_float_nnfw + GeneratedTests.log_4D_float_nnfw +@@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw + GeneratedTests.one_hot_ex_dynamic_nnfw + GeneratedTests.pack_ex_dynamic_nnfw + GeneratedTests.pad_dynamic_nnfw ++GeneratedTests.pad_v2_1_float ++GeneratedTests.pad_v2_1_quant8 ++GeneratedTests.pad_v2_all_dims ++GeneratedTests.pad_v2_all_dims_quant8 ++GeneratedTests.pad_v2_low_rank ++GeneratedTests.pad_v2_low_rank_quant8 + GeneratedTests.pow_2D_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw_2 + GeneratedTests.pow_broadcast_float_nnfw_3 + GeneratedTests.pow_dynamic_nnfw ++GeneratedTests.quantize_quant8 ++GeneratedTests.quantize_quant8_2 ++GeneratedTests.quantize_quant8_3 ++GeneratedTests.quantize_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.range_ex_float_1 + GeneratedTests.range_ex_float_1_all_constant_inputs + GeneratedTests.range_ex_float_1_dynamic_nnfw +diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon +index c9edee5..d987bf1 100644 +--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon ++++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon +@@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8 + GeneratedTests.cast_float16_to_quant8_overflow + GeneratedTests.cast_float32_to_float16 + GeneratedTests.cast_float32_to_float16_relaxed +-GeneratedTests.cast_float32_to_quant8_overflow +-GeneratedTests.cast_float32_to_quant8_overflow_relaxed + GeneratedTests.cast_int32_to_float16 +-GeneratedTests.cast_int32_to_quant8_overflow + GeneratedTests.cast_quant8_to_float16 + GeneratedTests.concat_dynamic_nnfw + GeneratedTests.conv_dynamic_nnfw +@@ 
-73,6 +70,7 @@ GeneratedTests.gather_float16_8 + GeneratedTests.greater_dynamic_float_nnfw + GeneratedTests.greater_equal_boolean + GeneratedTests.greater_equal_dynamic_float_nnfw ++GeneratedTests.l2_normalization_quant8_nnfw + GeneratedTests.less_boolean + GeneratedTests.less_dynamic_float_nnfw + GeneratedTests.less_equal_dynamic_float_nnfw +@@ -112,11 +110,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw + GeneratedTests.one_hot_ex_dynamic_nnfw + GeneratedTests.pack_ex_dynamic_nnfw + GeneratedTests.pad_dynamic_nnfw ++GeneratedTests.pad_v2_1_float ++GeneratedTests.pad_v2_1_quant8 ++GeneratedTests.pad_v2_all_dims ++GeneratedTests.pad_v2_all_dims_quant8 ++GeneratedTests.pad_v2_low_rank ++GeneratedTests.pad_v2_low_rank_quant8 + GeneratedTests.pow_2D_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw_2 + GeneratedTests.pow_broadcast_float_nnfw_3 + GeneratedTests.pow_dynamic_nnfw ++GeneratedTests.quantize_quant8 ++GeneratedTests.quantize_quant8_2 ++GeneratedTests.quantize_quant8_3 ++GeneratedTests.quantize_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.range_ex_float_1 + GeneratedTests.range_ex_float_1_all_constant_inputs + GeneratedTests.range_ex_float_1_dynamic_nnfw +diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu +index 3cce4f3..bc0ae0f 100644 +--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu ++++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu +@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 + GeneratedTests.hashtable_lookup_float + GeneratedTests.hashtable_lookup_float_4D_nnfw + GeneratedTests.hashtable_lookup_quant8 +-GeneratedTests.l2_normalization +-GeneratedTests.l2_normalization_2 +-GeneratedTests.l2_normalization_large + GeneratedTests.l2_pool_float + GeneratedTests.l2_pool_float_2 + GeneratedTests.l2_pool_float_large +@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 + GeneratedTests.neg + GeneratedTests.neg_3D_int_nnfw + GeneratedTests.neg_4D_int_nnfw +-GeneratedTests.pad_quant8_nnfw + GeneratedTests.prelu + GeneratedTests.prelu_broadcast_float_1_nnfw + GeneratedTests.prelu_broadcast_quant8_1_nnfw +@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 + GeneratedTests.prelu_weight_as_input_quant8_2 + GeneratedTests.prelu_weight_as_input_quant8_3 + GeneratedTests.prelu_weight_as_input_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.reduce_max_quant8 + GeneratedTests.reduce_max_quant8_1_nnfw + GeneratedTests.reduce_max_quant8_2 +@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 + GeneratedTests.select_v1_2_two_dim_quant8 + GeneratedTests.slice_5 + GeneratedTests.slice_6 +-GeneratedTests.slice_7 + GeneratedTests.slice_8 + GeneratedTests.slice_zero_sized + GeneratedTests.slice_zero_sized_quant8 +-GeneratedTests.space_to_depth_float_1 +-GeneratedTests.space_to_depth_float_2 +-GeneratedTests.space_to_depth_float_3 + GeneratedTests.space_to_depth_quant8_1 + GeneratedTests.space_to_depth_quant8_2 + GeneratedTests.sqrt_ +diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl +index e50b941..005f61c 100644 +--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl ++++ 
b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl +@@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8 + GeneratedTests.cast_float16_to_quant8_overflow + GeneratedTests.cast_float32_to_float16 + GeneratedTests.cast_float32_to_float16_relaxed ++GeneratedTests.cast_float32_to_int32_nnfw + GeneratedTests.cast_int32_to_float16 +-GeneratedTests.cast_int32_to_quant8_overflow + GeneratedTests.cast_quant8_to_float16 + GeneratedTests.concat_dynamic_nnfw + GeneratedTests.conv_dynamic_nnfw +@@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7 + GeneratedTests.gather_float16_8 + GeneratedTests.greater_dynamic_float_nnfw + GeneratedTests.greater_equal_dynamic_float_nnfw ++GeneratedTests.l2_normalization_quant8_nnfw + GeneratedTests.less_dynamic_float_nnfw + GeneratedTests.less_equal_dynamic_float_nnfw + GeneratedTests.log_4D_float_nnfw +@@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw + GeneratedTests.one_hot_ex_dynamic_nnfw + GeneratedTests.pack_ex_dynamic_nnfw + GeneratedTests.pad_dynamic_nnfw ++GeneratedTests.pad_v2_1_float ++GeneratedTests.pad_v2_1_quant8 ++GeneratedTests.pad_v2_all_dims ++GeneratedTests.pad_v2_all_dims_quant8 ++GeneratedTests.pad_v2_low_rank ++GeneratedTests.pad_v2_low_rank_quant8 + GeneratedTests.pow_2D_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw_2 + GeneratedTests.pow_broadcast_float_nnfw_3 + GeneratedTests.pow_dynamic_nnfw ++GeneratedTests.quantize_quant8 ++GeneratedTests.quantize_quant8_2 ++GeneratedTests.quantize_quant8_3 ++GeneratedTests.quantize_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.range_ex_float_1 + GeneratedTests.range_ex_float_1_all_constant_inputs + GeneratedTests.range_ex_float_1_dynamic_nnfw +diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon +index 55cfe39..051fbc7 100644 +--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon ++++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon +@@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8 + GeneratedTests.cast_float16_to_quant8_overflow + GeneratedTests.cast_float32_to_float16 + GeneratedTests.cast_float32_to_float16_relaxed +-GeneratedTests.cast_float32_to_quant8_overflow +-GeneratedTests.cast_float32_to_quant8_overflow_relaxed + GeneratedTests.cast_int32_to_float16 +-GeneratedTests.cast_int32_to_quant8_overflow + GeneratedTests.cast_quant8_to_float16 + GeneratedTests.concat_dynamic_nnfw + GeneratedTests.conv_dynamic_nnfw +@@ -73,6 +70,7 @@ GeneratedTests.greater_dynamic_float_nnfw + GeneratedTests.greater_equal_boolean + GeneratedTests.greater_equal_dynamic_float_nnfw + GeneratedTests.less_boolean ++GeneratedTests.l2_normalization_quant8_nnfw + GeneratedTests.less_dynamic_float_nnfw + GeneratedTests.less_equal_dynamic_float_nnfw + GeneratedTests.log_4D_float_nnfw +@@ -111,11 +109,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw + GeneratedTests.one_hot_ex_dynamic_nnfw + GeneratedTests.pack_ex_dynamic_nnfw + GeneratedTests.pad_dynamic_nnfw ++GeneratedTests.pad_v2_1_float ++GeneratedTests.pad_v2_1_quant8 ++GeneratedTests.pad_v2_all_dims ++GeneratedTests.pad_v2_all_dims_quant8 ++GeneratedTests.pad_v2_low_rank ++GeneratedTests.pad_v2_low_rank_quant8 + GeneratedTests.pow_2D_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw_2 + GeneratedTests.pow_broadcast_float_nnfw_3 + 
GeneratedTests.pow_dynamic_nnfw ++GeneratedTests.quantize_quant8 ++GeneratedTests.quantize_quant8_2 ++GeneratedTests.quantize_quant8_3 ++GeneratedTests.quantize_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.range_ex_float_1 + GeneratedTests.range_ex_float_1_all_constant_inputs + GeneratedTests.range_ex_float_1_dynamic_nnfw +diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu +index 3cce4f3..bc0ae0f 100644 +--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu ++++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu +@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 + GeneratedTests.hashtable_lookup_float + GeneratedTests.hashtable_lookup_float_4D_nnfw + GeneratedTests.hashtable_lookup_quant8 +-GeneratedTests.l2_normalization +-GeneratedTests.l2_normalization_2 +-GeneratedTests.l2_normalization_large + GeneratedTests.l2_pool_float + GeneratedTests.l2_pool_float_2 + GeneratedTests.l2_pool_float_large +@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 + GeneratedTests.neg + GeneratedTests.neg_3D_int_nnfw + GeneratedTests.neg_4D_int_nnfw +-GeneratedTests.pad_quant8_nnfw + GeneratedTests.prelu + GeneratedTests.prelu_broadcast_float_1_nnfw + GeneratedTests.prelu_broadcast_quant8_1_nnfw +@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 + GeneratedTests.prelu_weight_as_input_quant8_2 + GeneratedTests.prelu_weight_as_input_quant8_3 + GeneratedTests.prelu_weight_as_input_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.reduce_max_quant8 + GeneratedTests.reduce_max_quant8_1_nnfw + GeneratedTests.reduce_max_quant8_2 +@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 + GeneratedTests.select_v1_2_two_dim_quant8 + GeneratedTests.slice_5 + GeneratedTests.slice_6 +-GeneratedTests.slice_7 + GeneratedTests.slice_8 + GeneratedTests.slice_zero_sized + GeneratedTests.slice_zero_sized_quant8 +-GeneratedTests.space_to_depth_float_1 +-GeneratedTests.space_to_depth_float_2 +-GeneratedTests.space_to_depth_float_3 + GeneratedTests.space_to_depth_quant8_1 + GeneratedTests.space_to_depth_quant8_2 + GeneratedTests.sqrt_ +diff --git a/tests/nnapi/nnapi_gtest.skip.noarch.interp b/tests/nnapi/nnapi_gtest.skip.noarch.interp +index 08118ca..069d367 100644 +--- a/tests/nnapi/nnapi_gtest.skip.noarch.interp ++++ b/tests/nnapi/nnapi_gtest.skip.noarch.interp +@@ -188,6 +188,7 @@ GeneratedTests.hashtable_lookup_quant8 + GeneratedTests.l2_normalization + GeneratedTests.l2_normalization_2 + GeneratedTests.l2_normalization_large ++GeneratedTests.l2_normalization_quant8_nnfw + GeneratedTests.l2_pool_float + GeneratedTests.l2_pool_float_2 + GeneratedTests.l2_pool_float_large +@@ -312,6 +313,12 @@ GeneratedTests.pack_ex_2D_int_2 + GeneratedTests.pack_ex_dynamic_nnfw + GeneratedTests.pad_dynamic_nnfw + GeneratedTests.pad_quant8_nnfw ++GeneratedTests.pad_v2_1_float ++GeneratedTests.pad_v2_1_quant8 ++GeneratedTests.pad_v2_all_dims ++GeneratedTests.pad_v2_all_dims_quant8 ++GeneratedTests.pad_v2_low_rank ++GeneratedTests.pad_v2_low_rank_quant8 + GeneratedTests.pow_2D_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw + GeneratedTests.pow_broadcast_float_nnfw_2 +@@ -331,6 +338,15 @@ GeneratedTests.prelu_weight_as_input_quant8 + 
GeneratedTests.prelu_weight_as_input_quant8_2 + GeneratedTests.prelu_weight_as_input_quant8_3 + GeneratedTests.prelu_weight_as_input_quant8_4 ++GeneratedTests.quantize_quant8 ++GeneratedTests.quantize_quant8_2 ++GeneratedTests.quantize_quant8_3 ++GeneratedTests.quantize_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.range_ex_float_1 + GeneratedTests.range_ex_float_1_all_constant_inputs + GeneratedTests.range_ex_float_1_dynamic_nnfw +diff --git a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu +index 3cce4f3..bc0ae0f 100644 +--- a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu ++++ b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu +@@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 + GeneratedTests.hashtable_lookup_float + GeneratedTests.hashtable_lookup_float_4D_nnfw + GeneratedTests.hashtable_lookup_quant8 +-GeneratedTests.l2_normalization +-GeneratedTests.l2_normalization_2 +-GeneratedTests.l2_normalization_large + GeneratedTests.l2_pool_float + GeneratedTests.l2_pool_float_2 + GeneratedTests.l2_pool_float_large +@@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 + GeneratedTests.neg + GeneratedTests.neg_3D_int_nnfw + GeneratedTests.neg_4D_int_nnfw +-GeneratedTests.pad_quant8_nnfw + GeneratedTests.prelu + GeneratedTests.prelu_broadcast_float_1_nnfw + GeneratedTests.prelu_broadcast_quant8_1_nnfw +@@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 + GeneratedTests.prelu_weight_as_input_quant8_2 + GeneratedTests.prelu_weight_as_input_quant8_3 + GeneratedTests.prelu_weight_as_input_quant8_4 ++GeneratedTests.quantize_quant8_5 ++GeneratedTests.quantize_quant8_6 ++GeneratedTests.quantize_quant8_7 ++GeneratedTests.quantize_quant8_8 ++GeneratedTests.quantize_zero_sized + GeneratedTests.reduce_max_quant8 + GeneratedTests.reduce_max_quant8_1_nnfw + GeneratedTests.reduce_max_quant8_2 +@@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 + GeneratedTests.select_v1_2_two_dim_quant8 + GeneratedTests.slice_5 + GeneratedTests.slice_6 +-GeneratedTests.slice_7 + GeneratedTests.slice_8 + GeneratedTests.slice_zero_sized + GeneratedTests.slice_zero_sized_quant8 +-GeneratedTests.space_to_depth_float_1 +-GeneratedTests.space_to_depth_float_2 +-GeneratedTests.space_to_depth_float_3 + GeneratedTests.space_to_depth_quant8_1 + GeneratedTests.space_to_depth_quant8_2 + GeneratedTests.sqrt_ +diff --git a/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py +new file mode 100644 +index 0000000..ca3770c +--- /dev/null ++++ b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py +@@ -0,0 +1,30 @@ ++# ++# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved ++# Copyright (C) 2017 The Android Open Source Project ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++# ++ ++model = Model() ++in0 = Input("op1", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128") ++out0 = Output("op2", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128") ++model = model.Operation("L2_NORMALIZATION", in0).To(out0) ++ ++# Example 1. Input in operand 0, ++input0 = {in0: # input 0 ++ [0, 5, 12]} ++output0 = {out0: # output 0 ++ [51, 54, 58]} ++ ++# Instantiate an example ++Example((input0, output0)) +diff --git a/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py +new file mode 100644 +index 0000000..c500741 +--- /dev/null ++++ b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py +@@ -0,0 +1,35 @@ ++# ++# Copyright (C) 2018 The Android Open Source Project ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++# ++ ++input0 = Input("input0", "TENSOR_FLOAT32", "{1, 2, 3, 1}") ++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0, ++ 0, 2, ++ 1, 3, ++ 0, 0]) ++pad_value = Float32Scalar("pad_value", 9.3) ++output0 = Output("output0", "TENSOR_FLOAT32", "{1, 4, 7, 1}") ++ ++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) ++ ++Example(({ ++ input0: [1.0, 2.0, 3.0, ++ 4.0, 5.0, 6.0], ++}, { ++ output0: [9.3, 1.0, 2.0, 3.0, 9.3, 9.3, 9.3, ++ 9.3, 4.0, 5.0, 6.0, 9.3, 9.3, 9.3, ++ 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, ++ 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3], ++})).AddVariations("float16", "relaxed") +diff --git a/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py +new file mode 100644 +index 0000000..3dfaff6 +--- /dev/null ++++ b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py +@@ -0,0 +1,35 @@ ++# ++# Copyright (C) 2018 The Android Open Source Project ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++# ++ ++input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 2, 3, 1}, 2.3, 4") ++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0, ++ 0, 2, ++ 1, 3, ++ 0, 0]) ++pad_value = Int32Scalar("pad_value", 9) ++output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{1, 4, 7, 1}, 2.3, 4") ++ ++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) ++ ++Example(({ ++ input0: [1, 2, 3, ++ 4, 5, 6], ++}, { ++ output0: [9, 1, 2, 3, 9, 9, 9, ++ 9, 4, 5, 6, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9], ++})) +diff --git a/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py +new file mode 100644 +index 0000000..5b27f49 +--- /dev/null ++++ b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py +@@ -0,0 +1,40 @@ ++# ++# Copyright (C) 2019 The Android Open Source Project ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++# ++ ++import numpy as np ++ ++input0 = Input("input0", "TENSOR_FLOAT32", "{1, 1, 2, 3}") ++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2, ++ 3, 4, ++ 3, 3, ++ 2, 1]) ++pad_value = Float32Scalar("pad_value", 3.9) ++output0 = Output("output0", "TENSOR_FLOAT32", "{4, 8, 8, 6}") ++ ++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) ++ ++Example({ ++ input0: [1.0, 2.0, 3.0, ++ 4.0, 5.0, 6.0], ++ output0: np.pad([[[[1.0, 2.0, 3.0], ++ [4.0, 5.0, 6.0]]]], ++ [[1, 2], ++ [3, 4], ++ [3, 3], ++ [2, 1]], ++ "constant", ++ constant_values=3.9).flatten().tolist(), ++}).AddVariations("float16", "relaxed") +diff --git a/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py +new file mode 100644 +index 0000000..5ee4b06 +--- /dev/null ++++ b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py +@@ -0,0 +1,40 @@ ++# ++# Copyright (C) 2019 The Android Open Source Project ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++#
++
++import numpy as np
++
++input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 1, 2, 3}, 2.3, 4")
++paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2,
++                                                            3, 4,
++                                                            3, 3,
++                                                            2, 1])
++pad_value = Int32Scalar("pad_value", 3)
++output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{4, 8, 8, 6}, 2.3, 4")
++
++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
++
++Example({
++    input0: [1, 2, 3,
++             4, 5, 6],
++    output0: np.pad([[[[1, 2, 3],
++                       [4, 5, 6]]]],
++                    [[1, 2],
++                     [3, 4],
++                     [3, 3],
++                     [2, 1]],
++                    "constant",
++                    constant_values=3).flatten().tolist(),
++})
+diff --git a/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py
+new file mode 100644
+index 0000000..391d5cf
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py
+@@ -0,0 +1,27 @@
++#
++# Copyright (C) 2019 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{3}, 2.3, 4")
++paddings = Parameter("paddings", "TENSOR_INT32", "{1, 2}", [3, 1])
++pad_value = Int32Scalar("pad_value", 9)
++output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{7}, 2.3, 4")
++
++model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0)
++
++Example({
++    input0: [1, 2, 3],
++    output0: [9, 9, 9, 1, 2, 3, 9],
++})
+diff --git a/tests/nnapi/specs/V1_2/quantize.mod.py b/tests/nnapi/specs/V1_2/quantize.mod.py
+new file mode 100644
+index 0000000..a42624d
+--- /dev/null
++++ b/tests/nnapi/specs/V1_2/quantize.mod.py
+@@ -0,0 +1,69 @@
++#
++# Copyright (C) 2018 The Android Open Source Project
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++#
++
++import numpy as np
++
++num_values = 300
++values = list(np.linspace(-10, 10, num_values))
++
++for input_type in ["TENSOR_FLOAT32", "TENSOR_FLOAT16"]:
++    for scale, offset in [(1.0, 0),
++                          (1.0, 1),
++                          (0.01, 120),
++                          (10.0, 120)]:
++        input0 = Input("input0", input_type, "{%d}" % num_values)
++        output0 = Output("output0", input_type, "{%d}" % num_values)
++
++        model = Model().Operation("QUANTIZE", input0).To(output0)
++
++        quantizeOutput = DataTypeConverter().Identify({
++            output0: ["TENSOR_QUANT8_ASYMM", scale, offset],
++        })
++
++        Example({
++            input0: values,
++            output0: values,
++        }).AddVariations(quantizeOutput, includeDefault=False)
++
++
++# Zero-sized input
++
++# Use BOX_WITH_NMS_LIMIT op to generate a zero-sized internal tensor for box coordinates.
++p1 = Parameter("scores", "TENSOR_FLOAT32", "{1, 2}", [0.90, 0.10]) # scores
++p2 = Parameter("roi", "TENSOR_FLOAT32", "{1, 8}", [1, 1, 10, 10, 0, 0, 10, 10]) # roi
++o1 = Output("scoresOut", "TENSOR_FLOAT32", "{0}") # scores out
++o2 = Output("classesOut", "TENSOR_INT32", "{0}") # classes out
++tmp1 = Internal("roiOut", "TENSOR_FLOAT32", "{0, 4}") # roi out
++tmp2 = Internal("batchSplitOut", "TENSOR_INT32", "{0}") # batch split out
++model = Model("zero_sized").Operation("BOX_WITH_NMS_LIMIT", p1, p2, [0], 0.3, -1, 0, 0.4, 1.0, 0.3).To(o1, tmp1, o2, tmp2)
++
++# Use ROI_ALIGN op to convert into zero-sized feature map.
++layout = BoolScalar("layout", False) # NHWC
++i1 = Input("in", "TENSOR_FLOAT32", "{1, 1, 1, 1}")
++zero_sized = Internal("featureMap", "TENSOR_FLOAT32", "{0, 2, 2, 1}")
++model = model.Operation("ROI_ALIGN", i1, tmp1, tmp2, 2, 2, 2.0, 2.0, 4, 4, layout).To(zero_sized)
++
++# QUANTIZE op with numBatches = 0.
++o3 = Output("out", "TENSOR_QUANT8_ASYMM", "{0, 2, 2, 1}, 0.1f, 128") # out
++model = model.Operation("QUANTIZE", zero_sized).To(o3)
++
++# Create test case with dummy values.
++Example({ ++ i1: [1], ++ o1: [0], ++ o2: [0], ++ o3: [0], ++}).AddVariations("relaxed", "float16") +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py +deleted file mode 100644 +index c500741..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py ++++ /dev/null +@@ -1,35 +0,0 @@ +-# +-# Copyright (C) 2018 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-input0 = Input("input0", "TENSOR_FLOAT32", "{1, 2, 3, 1}") +-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0, +- 0, 2, +- 1, 3, +- 0, 0]) +-pad_value = Float32Scalar("pad_value", 9.3) +-output0 = Output("output0", "TENSOR_FLOAT32", "{1, 4, 7, 1}") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example(({ +- input0: [1.0, 2.0, 3.0, +- 4.0, 5.0, 6.0], +-}, { +- output0: [9.3, 1.0, 2.0, 3.0, 9.3, 9.3, 9.3, +- 9.3, 4.0, 5.0, 6.0, 9.3, 9.3, 9.3, +- 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, +- 9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 9.3], +-})).AddVariations("float16", "relaxed") +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py +deleted file mode 100644 +index 3dfaff6..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py ++++ /dev/null +@@ -1,35 +0,0 @@ +-# +-# Copyright (C) 2018 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 2, 3, 1}, 2.3, 4") +-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [0, 0, +- 0, 2, +- 1, 3, +- 0, 0]) +-pad_value = Int32Scalar("pad_value", 9) +-output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{1, 4, 7, 1}, 2.3, 4") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example(({ +- input0: [1, 2, 3, +- 4, 5, 6], +-}, { +- output0: [9, 1, 2, 3, 9, 9, 9, +- 9, 4, 5, 6, 9, 9, 9, +- 9, 9, 9, 9, 9, 9, 9, +- 9, 9, 9, 9, 9, 9, 9], +-})) +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py +deleted file mode 100644 +index 5b27f49..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py ++++ /dev/null +@@ -1,40 +0,0 @@ +-# +-# Copyright (C) 2019 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. 
+-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-import numpy as np +- +-input0 = Input("input0", "TENSOR_FLOAT32", "{1, 1, 2, 3}") +-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2, +- 3, 4, +- 3, 3, +- 2, 1]) +-pad_value = Float32Scalar("pad_value", 3.9) +-output0 = Output("output0", "TENSOR_FLOAT32", "{4, 8, 8, 6}") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example({ +- input0: [1.0, 2.0, 3.0, +- 4.0, 5.0, 6.0], +- output0: np.pad([[[[1.0, 2.0, 3.0], +- [4.0, 5.0, 6.0]]]], +- [[1, 2], +- [3, 4], +- [3, 3], +- [2, 1]], +- "constant", +- constant_values=3.9).flatten().tolist(), +-}).AddVariations("float16", "relaxed") +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py +deleted file mode 100644 +index 5ee4b06..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py ++++ /dev/null +@@ -1,40 +0,0 @@ +-# +-# Copyright (C) 2019 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-import numpy +- +-input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{1, 1, 2, 3}, 2.3, 4") +-paddings = Parameter("paddings", "TENSOR_INT32", "{4, 2}", [1, 2, +- 3, 4, +- 3, 3, +- 2, 1]) +-pad_value = Int32Scalar("pad_value", 3) +-output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{4, 8, 8, 6}, 2.3, 4") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example({ +- input0: [1, 2, 3, +- 4, 5, 6], +- output0: np.pad([[[[1, 2, 3], +- [4, 5, 6]]]], +- [[1, 2], +- [3, 4], +- [3, 3], +- [2, 1]], +- "constant", +- constant_values=3).flatten().tolist(), +-}) +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py +deleted file mode 100644 +index 391d5cf..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py ++++ /dev/null +@@ -1,27 +0,0 @@ +-# +-# Copyright (C) 2019 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. 
+-# +- +-input0 = Input("input0", "TENSOR_FLOAT32", "{3}") +-paddings = Parameter("paddings", "TENSOR_INT32", "{1, 2}", [3, 1]) +-pad_value = Float32Scalar("pad_value", 9.9) +-output0 = Output("output0", "TENSOR_FLOAT32", "{7}") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example({ +- input0: [1.0, 2.0, 3.0], +- output0: [9.9, 9.9, 9.9, 1.0, 2.0, 3.0, 9.9], +-}).AddVariations("float16") +diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py b/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py +deleted file mode 100644 +index b67c2b8..0000000 +--- a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py ++++ /dev/null +@@ -1,27 +0,0 @@ +-# +-# Copyright (C) 2019 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-input0 = Input("input0", "TENSOR_QUANT8_ASYMM", "{3}, 2.3, 4") +-paddings = Parameter("paddings", "TENSOR_INT32", "{1, 2}", [3, 1]) +-pad_value = Int32Scalar("pad_value", 9) +-output0 = Output("output0", "TENSOR_QUANT8_ASYMM", "{7}, 2.3, 4") +- +-model = Model().Operation("PAD_V2", input0, paddings, pad_value).To(output0) +- +-Example({ +- input0: [1, 2, 3], +- output0: [9, 9, 9, 1, 2, 3, 9], +-}) +diff --git a/tests/nnapi/specs/skip/V1_2/quantize.mod.py b/tests/nnapi/specs/skip/V1_2/quantize.mod.py +deleted file mode 100644 +index a42624d..0000000 +--- a/tests/nnapi/specs/skip/V1_2/quantize.mod.py ++++ /dev/null +@@ -1,69 +0,0 @@ +-# +-# Copyright (C) 2018 The Android Open Source Project +-# +-# Licensed under the Apache License, Version 2.0 (the "License"); +-# you may not use this file except in compliance with the License. +-# You may obtain a copy of the License at +-# +-# http://www.apache.org/licenses/LICENSE-2.0 +-# +-# Unless required by applicable law or agreed to in writing, software +-# distributed under the License is distributed on an "AS IS" BASIS, +-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-# See the License for the specific language governing permissions and +-# limitations under the License. +-# +- +-import numpy as np +- +-num_values = 300 +-values = list(np.linspace(-10, 10, num_values)) +- +-for input_type in ["TENSOR_FLOAT32", "TENSOR_FLOAT16"]: +- for scale, offset in [(1.0, 0), +- (1.0, 1), +- (0.01, 120), +- (10.0, 120)]: +- input0 = Input("input0", input_type, "{%d}" % num_values) +- output0 = Output("output0", input_type, "{%d}" % num_values) +- +- model = Model().Operation("QUANTIZE", input0).To(output0) +- +- quantizeOutput = DataTypeConverter().Identify({ +- output0: ["TENSOR_QUANT8_ASYMM", scale, offset], +- }) +- +- Example({ +- input0: values, +- output0: values, +- }).AddVariations(quantizeOutput, includeDefault=False) +- +- +-# Zero-sized input +- +-# Use BOX_WITH_NMS_LIMIT op to generate a zero-sized internal tensor for box cooridnates. 
+-p1 = Parameter("scores", "TENSOR_FLOAT32", "{1, 2}", [0.90, 0.10]) # scores +-p2 = Parameter("roi", "TENSOR_FLOAT32", "{1, 8}", [1, 1, 10, 10, 0, 0, 10, 10]) # roi +-o1 = Output("scoresOut", "TENSOR_FLOAT32", "{0}") # scores out +-o2 = Output("classesOut", "TENSOR_INT32", "{0}") # classes out +-tmp1 = Internal("roiOut", "TENSOR_FLOAT32", "{0, 4}") # roi out +-tmp2 = Internal("batchSplitOut", "TENSOR_INT32", "{0}") # batch split out +-model = Model("zero_sized").Operation("BOX_WITH_NMS_LIMIT", p1, p2, [0], 0.3, -1, 0, 0.4, 1.0, 0.3).To(o1, tmp1, o2, tmp2) +- +-# Use ROI_ALIGN op to convert into zero-sized feature map. +-layout = BoolScalar("layout", False) # NHWC +-i1 = Input("in", "TENSOR_FLOAT32", "{1, 1, 1, 1}") +-zero_sized = Internal("featureMap", "TENSOR_FLOAT32", "{0, 2, 2, 1}") +-model = model.Operation("ROI_ALIGN", i1, tmp1, tmp2, 2, 2, 2.0, 2.0, 4, 4, layout).To(zero_sized) +- +-# QUANTIZE op with numBatches = 0. +-o3 = Output("out", "TENSOR_QUANT8_ASYMM", "{0, 2, 2, 1}, 0.1f, 128") # out +-model = model.Operation("QUANTIZE", zero_sized).To(o3) +- +-# Create test case with dummy values. +-Example({ +- i1: [1], +- o1: [0], +- o2: [0], +- o3: [0], +-}).AddVariations("relaxed", "float16") +diff --git a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc +index 67f2467..c6c6355 100644 +--- a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc ++++ b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc +@@ -51,19 +51,24 @@ TEST_F(ValidationTestAddModelLoaded, output_tensorinfo) + ASSERT_EQ(tensor_info.dims[0], 1); + } + +-TEST_F(ValidationTestAddModelLoaded, neg_run_001) ++TEST_F(ValidationTestAddModelLoaded, neg_run) + { +- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); ++ // nnfw_prepare is not called ++ ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); + } + +-TEST_F(ValidationTestAddModelLoaded, neg_set_input_001) ++TEST_F(ValidationTestAddModelLoaded, neg_set_input) + { +- ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); ++ // nnfw_prepare is not called ++ ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), ++ NNFW_STATUS_INVALID_STATE); + } + +-TEST_F(ValidationTestAddModelLoaded, neg_set_output_001) ++TEST_F(ValidationTestAddModelLoaded, neg_set_output) + { +- ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); ++ // nnfw_prepare is not called ++ ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), ++ NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestAddModelLoaded, neg_get_input_size) +@@ -81,7 +86,7 @@ TEST_F(ValidationTestAddModelLoaded, neg_load_model) + // load model twice + ASSERT_EQ(nnfw_load_model_from_file( + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), +- NNFW_STATUS_ERROR); ++ NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestAddModelLoaded, neg_output_tensorinfo) +diff --git a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc +index 1bb4182..0f4a4af 100644 +--- a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc ++++ b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc +@@ -102,7 +102,7 @@ TEST_F(ValidationTestAddSessionPrepared, neg_run_during_async_run) + { + SetInOutBuffers(); + ASSERT_EQ(nnfw_run_async(_session), NNFW_STATUS_NO_ERROR); +- EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); ++ EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); 
+ ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_NO_ERROR); + } + +@@ -152,13 +152,13 @@ TEST_F(ValidationTestAddSessionPrepared, neg_load_model) + // Load model twice + ASSERT_EQ(nnfw_load_model_from_file( + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), +- NNFW_STATUS_ERROR); ++ NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestAddSessionPrepared, neg_prepare) + { + // Call Prepare twice +- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); + } + + // TODO Validation check when "nnfw_run" is called without input & output tensor setting +diff --git a/tests/nnfw_api/src/ValidationTestSessionCreated.cc b/tests/nnfw_api/src/ValidationTestSessionCreated.cc +index 2675aa7..01832db 100644 +--- a/tests/nnfw_api/src/ValidationTestSessionCreated.cc ++++ b/tests/nnfw_api/src/ValidationTestSessionCreated.cc +@@ -58,7 +58,7 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_1) + nnfw_load_model_from_file( + _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_NO_MANIFEST).c_str()), + NNFW_STATUS_ERROR); +- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2) +@@ -67,52 +67,52 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2) + _session, + NNPackages::get().getModelAbsolutePath(NNPackages::ADD_INVALID_MANIFEST).c_str()), + NNFW_STATUS_ERROR); +- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_prepare_001) + { + // nnfw_load_model_from_file was not called +- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_run_001) + { + // nnfw_load_model_from_file and nnfw_prepare was not called +- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_set_input_001) + { +- // Invalid state +- ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), ++ NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_set_output_001) + { +- // Invalid state +- ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), ++ NNFW_STATUS_INVALID_STATE); + } + + TEST_F(ValidationTestSessionCreated, neg_get_input_size) + { + uint32_t size = 10000; +- ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_ERROR); +- ASSERT_EQ(size, 10000); ++ ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_INVALID_STATE); ++ ASSERT_EQ(size, 10000); // Remain unchanged + } + + TEST_F(ValidationTestSessionCreated, neg_get_output_size) + { + uint32_t size = 10000; +- ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_ERROR); +- ASSERT_EQ(size, 10000); ++ ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_INVALID_STATE); ++ ASSERT_EQ(size, 10000); // Remain unchanged + } + + TEST_F(ValidationTestSessionCreated, neg_output_tensorinfo) + { + nnfw_tensorinfo tensor_info; + // model is not loaded +- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), 
NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_INVALID_STATE); + // model is not loaded and tensor_info is null +- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_ERROR); ++ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_INVALID_STATE); + } +diff --git a/tests/scripts/benchmark_nnapi.sh b/tests/scripts/benchmark_nnapi.sh +index c7f44c5..af79728 100755 +--- a/tests/scripts/benchmark_nnapi.sh ++++ b/tests/scripts/benchmark_nnapi.sh +@@ -18,7 +18,6 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + + source $MY_PATH/common.sh + +-BENCHMARK_RUN_TEST_SH= + BENCHMARK_DRIVER_BIN= + BENCHMARK_REPORT_DIR= + BENCHMARK_MODELS_FILE= +@@ -30,7 +29,7 @@ EXECUTORS="Linear Parallel" #TODO: accept this list as argument + + function Usage() + { +- echo "Usage: ./$0 --reportdir=. --runtestsh=tests/scripts/framework/run_test.sh --driverbin=Product/out/bin/tflite_run" ++ echo "Usage: ./$0 --reportdir=. --driverbin=Product/out/bin/tflite_run" + } + + for i in "$@" +@@ -43,9 +42,6 @@ do + --test_op) + TEST_OP="true" + ;; +- --runtestsh=*) +- BENCHMARK_RUN_TEST_SH=${i#*=} +- ;; + --driverbin=*) + BENCHMARK_DRIVER_BIN=${i#*=} + ;; +@@ -147,9 +143,8 @@ function run_onert_with_all_config() + local REPORT_MODEL_DIR=$2 + local PAUSE_TIME_IN_SEC=$3 + local BENCHMARK_DRIVER_BIN=$4 +- local BENCHMARK_RUN_TEST_SH=$5 +- local EXECUTORS=$6 +- local BACKEND_LIST=$7 ++ local EXECUTORS=$5 ++ local BACKEND_LIST=$6 + + export USE_NNAPI=1 + +@@ -163,18 +158,18 @@ function run_onert_with_all_config() + done + export BACKENDS=$BACKENDS_TO_USE + if [ "$TEST_OP" == "false" ]; then +- profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT ++ profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT + fi + + for executor in $EXECUTORS; do + export EXECUTOR=$executor + if [ "$TEST_OP" == "false" ]; then +- run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $executor ++ run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $executor + fi + for backend in $BACKEND_LIST; do + export OP_BACKEND_ALLOPS=$backend + run_benchmark_and_print "tflite_onert_"$executor"_executor_$backend" "TFLite onert $executor Executor $backend"\ +- $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH ++ $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN + done + done + unset USE_NNAPI EXECUTOR OP_BACKEND_ALLOPS BACKENDS +@@ -215,14 +210,14 @@ function run_benchmark_test() + + # TFLite+CPU + unset USE_NNAPI +- run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH ++ run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN + + # run onert + if [ "$TEST_OP" == "true" ]; then + # Operation test don't need to test each scheduler +- run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "Linear" "$BACKEND_LIST" ++ run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "Linear" "$BACKEND_LIST" + else +- run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "$EXECUTORS" "$BACKEND_LIST" ++ run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "$EXECUTORS" "$BACKEND_LIST" + fi + + if [[ $i -ne $(echo $BENCHMARK_MODEL_LIST | wc -w)-1 ]]; then +diff --git a/tests/scripts/common.sh 
b/tests/scripts/common.sh
+index 8800290..b2799c2 100755
+--- a/tests/scripts/common.sh
++++ b/tests/scripts/common.sh
+@@ -18,13 +18,12 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+ function get_result_of_benchmark_test()
+ {
+-    local RUN_TEST_SH=$1
+-    local DRIVER_BIN=$2
+-    local MODEL=$3
+-    local LOG_FILE=$4
++    local DRIVER_BIN=$1
++    local MODEL=$2
++    local LOG_FILE=$3
+
+     local RET=0
+-    $RUN_TEST_SH --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
++    $MY_PATH/framework/run_test.sh --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
+     RET=$?
+     if [[ $RET -ne 0 ]]; then
+         echo "Testing $MODEL aborted... exit code: $RET"
+@@ -68,7 +67,7 @@ function run_benchmark_and_print()
+     LOG_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.txt
+     RESULT_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.result
+     print_with_dots $MSG
+-    RESULT=$(get_result_of_benchmark_test $BENCHMARK_RUN_TEST_SH $DRIVER_BIN $MODEL $LOG_FILE)
++    RESULT=$(get_result_of_benchmark_test $DRIVER_BIN $MODEL $LOG_FILE)
+     echo "$RESULT ms"
+     print_result_of_benchmark_test "$MSG" "$RESULT" $RESULT_FILE
+     sleep $PAUSE_TIME_IN_SEC
+diff --git a/tests/scripts/framework/run_test.sh b/tests/scripts/framework/run_test.sh
+index 44b7149..9440c52 100755
+--- a/tests/scripts/framework/run_test.sh
++++ b/tests/scripts/framework/run_test.sh
+@@ -28,10 +28,12 @@ function Usage()
+     echo "Usage: ./$0 --driverbin={such as tflite_run} {tests to test or empty for all of tests}"
+     echo "Usage: ./$0 --driverbin=Product/out/bin/tflite_run --reportdir=report --tapname=verification.tap avgpool1 avgpool2"
+     echo ""
+-    echo "--download - (default=off) Download model files. Other options is ignored"
+-    echo "--driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests"
+-    echo "--reportdir - (default=report) directory to place tap files"
+-    echo "--tapname - (default=framework_test.tap) file name to be written for tap"
++    echo "--download - (default=on) Download model files"
++    echo "--run - (default=on) Test model files"
++    echo "--driverbin - (default=../../Product/out/bin/tflite_run) Runner for running model tests"
++    echo "--reportdir - (default=report) Directory to place tap files"
++    echo "--tapname - (default=framework_test.tap) File name to be written for tap"
++    echo "--md5 - (default=on) MD5 check when downloading model files"
+     echo ""
+ }
+
+@@ -43,9 +45,13 @@ function need_download()
+         return 0;
+     fi
+     # Ignore checking md5 in cache
++    # TODO Use "--md5" option only and remove IGNORE_MD5 environment variable
+     if [ ! -z $IGNORE_MD5 ] && [ "$IGNORE_MD5" == "1" ]; then
+         return 1
+     fi
++    if [ "$MD5_CHECK" = "off" ]; then
++        return 1
++    fi
+
+     LOCAL_HASH=$(md5sum $LOCAL_PATH | awk '{ print $1 }')
+     REMOTE_HASH=$(curl -ss $REMOTE_URL | md5sum | awk '{ print $1 }')
+@@ -60,7 +66,9 @@ function need_download()
+ DRIVER_BIN=""
+ TAP_NAME="framework_test.tap"
+ TEST_LIST=()
+-DOWNLOAD_MODE="off"
++DOWNLOAD_MODEL="on"
++RUN_TEST="on"
++MD5_CHECK="on"
+
+ # Support environment variable setting for mirror server
+ FIXED_MODELFILE_SERVER="${MODELFILE_SERVER:-}"
+@@ -84,6 +92,12 @@ do
+         --download=*)
+             DOWNLOAD_MODE=${i#*=}
+             ;;
++        --md5=*)
++            MD5_CHECK=${i#*=}
++            ;;
++        --run=*)
++            RUN_TEST=${i#*=}
++            ;;
+         *)
+             TEST_LIST+=( $i )
+             ;;
+@@ -100,7 +114,7 @@ if [ ! -n "$DRIVER_BIN" ]; then
+ fi
+
+ # Check test driver setting
+-if [ ! -e $DRIVER_BIN ] && [ "$DOWNLOAD_MODE" != "on" ]; then
++if [ !
-e $DRIVER_BIN ] && [ "$RUN_TEST" = "on" ]; then + echo "Cannot find test driver" $DRIVER_BIN ": please set proper DRIVER_BIN" + exit 1 + fi +@@ -139,33 +153,9 @@ run_tests() + + TEST_CACHE_PATH=$CACHE_ROOT_PATH/$TEST_NAME + MODELFILE=$TEST_CACHE_PATH/$MODELFILE_NAME +- MODELFILE_URL="$MODELFILE_SERVER_PATH/$MODELFILE_NAME" +- if [ -n "$FIXED_MODELFILE_SERVER" ]; then +- MODELFILE_URL="$FIXED_MODELFILE_SERVER/$MODELFILE_NAME" +- fi +- +- # Download model file +- if [ ! -e $TEST_CACHE_PATH ]; then +- mkdir -p $TEST_CACHE_PATH +- fi +- +- # Download unless we have it in cache (Also check md5sum) +- if need_download "$MODELFILE" "$MODELFILE_URL"; then +- echo "" +- echo "Download test file for $TEST_NAME" +- echo "======================" +- +- rm -f $MODELFILE # Remove invalid file if exists +- pushd $TEST_CACHE_PATH +- wget -nv $MODELFILE_URL +- if [ "${MODELFILE_NAME##*.}" == "zip" ]; then +- unzip -o $MODELFILE_NAME +- fi +- popd +- fi + + # Find model file for downloaded by zip +- if [ "${MODELFILE_NAME##*.}" == "zip" ]; then ++ if [ "${MODELFILE_NAME##*.}" = "zip" ]; then + pushd $TEST_CACHE_PATH + MODELFILE=$TEST_CACHE_PATH/$(ls *.tflite) + popd +@@ -178,7 +168,6 @@ run_tests() + # Run driver to test framework + $DRIVER_BIN $MODELFILE + +- #$DRIVER_BIN $MODELFILE + if [[ $? -eq 0 ]]; then + echo "ok $i - $TEST_NAME" >> $REPORT_DIR/$TAP_NAME + else +@@ -268,10 +257,11 @@ find_tests() + mkdir -p $REPORT_DIR + TESTS_TO_RUN=$(find_tests ${TEST_LIST[@]}) + +-if [[ "$DOWNLOAD_MODE" == "on" ]]; then ++if [ "$DOWNLOAD_MODEL" = "on" ]; then + download_tests $TESTS_TO_RUN +- exit 0; + fi + +-run_tests $TESTS_TO_RUN ++if [ "$RUN_TEST" = "on" ]; then ++ run_tests $TESTS_TO_RUN ++fi + exit $? +diff --git a/tests/scripts/test-driver.sh b/tests/scripts/test-driver.sh +index 615fc2c..a720b15 100755 +--- a/tests/scripts/test-driver.sh ++++ b/tests/scripts/test-driver.sh +@@ -38,7 +38,6 @@ function Usage() + echo "etc." + echo "--framework_driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests" + echo "--verification_driverbin - (default=../../Product/out/bin/nnapi_test) runner for runnning verification tests" +- echo "--runtestsh - (default=\$ARTIFACT_PATH/tests/scripts/framework/run_test.sh) run_test.sh with path where it is for framework test and verification" + echo "--unittestdir - (default=\$ARTIFACT_PATH/Product/out/unittest) directory that has unittest binaries for unit test" + echo "" + echo "--reportdir - (default=\$ARTIFACT_PATH/report) directory to save report" +@@ -49,7 +48,6 @@ TEST_DRIVER_DIR="$( cd "$( dirname "${BASH_SOURCE}" )" && pwd )" + ARTIFACT_PATH="$TEST_DRIVER_DIR/../../" + FRAMEWORK_DRIVER_BIN="" + VERIFICATION_DRIVER_BIN="" +-RUN_TEST_SH="" + UNIT_TEST_DIR="" + ALLTEST_ON="true" + UNITTEST_ON="false" +@@ -74,9 +72,6 @@ do + --verification_driverbin=*) + VERIFICATION_DRIVER_BIN=${i#*=} + ;; +- --runtestsh=*) +- RUN_TEST_SH=${i#*=} +- ;; + --unittestdir=*) + UNIT_TEST_DIR=${i#*=} + ;; +@@ -116,15 +111,6 @@ done + + ARTIFACT_PATH="$(readlink -f $ARTIFACT_PATH)" + +-if [ -z "$RUN_TEST_SH" ]; then +- RUN_TEST_SH=$ARTIFACT_PATH/tests/scripts/framework/run_test.sh +-fi +- +-if [ ! 
-e "$RUN_TEST_SH" ]; then +- echo "Cannot find $RUN_TEST_SH" +- exit 1 +-fi +- + if [ -z "$UNIT_TEST_DIR" ]; then + UNIT_TEST_DIR=$ARTIFACT_PATH/Product/out/unittest + fi +@@ -149,7 +135,6 @@ if [ "$FRAMEWORKTEST_ON" == "true" ]; then + fi + + $TEST_DRIVER_DIR/test_framework.sh \ +- --runtestsh=$RUN_TEST_SH \ + --driverbin=$FRAMEWORK_DRIVER_BIN \ + --reportdir=$REPORT_DIR \ + --tapname=framework_test.tap \ +@@ -166,7 +151,6 @@ if [ "$ALLTEST_ON" == "true" ] || [ "$VERIFICATION_ON" == "true" ]; then + + # verification uses the same script as frameworktest does + $TEST_DRIVER_DIR/test_framework.sh \ +- --runtestsh=$RUN_TEST_SH \ + --driverbin=$VERIFICATION_DRIVER_BIN \ + --reportdir=$REPORT_DIR \ + --tapname=verification_test.tap \ +@@ -180,7 +164,6 @@ if [ "$BENCHMARK_ONERT_OP_ON" == "true" ]; then + + $TEST_DRIVER_DIR/benchmark_nnapi.sh \ + --test_op \ +- --runtestsh=$RUN_TEST_SH \ + --driverbin=$DRIVER_BIN \ + --reportdir=$REPORT_DIR/benchmark_op \ + --modelfilepath=$ARTIFACT_PATH/tests/scripts/framework +diff --git a/tests/scripts/test_framework.sh b/tests/scripts/test_framework.sh +index 1d97515..bd86cd3 100755 +--- a/tests/scripts/test_framework.sh ++++ b/tests/scripts/test_framework.sh +@@ -14,7 +14,8 @@ + # See the License for the specific language governing permissions and + # limitations under the License. + +-FWTEST_RUN_TEST_SH= ++MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" ++ + FWTEST_DRIVER_BIN= + FWTEST_REPORT_DIR= + FWTEST_TAP_NAME= +@@ -25,7 +26,6 @@ function Usage() + { + echo "Usage Example:" + echo "./$0 \\" +- echo " --runtestsh=tests/scripts/framework/run_test.sh \\ # Test runner script path" + echo " --driverbin=Product/out/bin/tflite_run \\ # Test driver path" + echo " --frameworktest_list_file=tests/scripts/list/frameworktest_list.armv7l.cpu.txt \\" + echo " --reportdir=report \\ # Directory for the report files will be saved" +@@ -42,9 +42,6 @@ do + -h|--help|help) + Usage + ;; +- --runtestsh=*) +- FWTEST_RUN_TEST_SH=${i#*=} +- ;; + --driverbin=*) + FWTEST_DRIVER_BIN=${i#*=} + ;; +@@ -67,7 +64,6 @@ do + shift + done + +-[ ! -z "$FWTEST_RUN_TEST_SH" ] || Usage + [ ! -z "$FWTEST_DRIVER_BIN" ] || Usage + [ ! -z "$FWTEST_REPORT_DIR" ] || Usage + [ ! -z "$FWTEST_TAP_NAME" ] || Usage +@@ -86,7 +82,7 @@ if [ ! 
-z "$FRAMEWORKTEST_LIST_FILE" ]; then + MODELLIST=$(cat "${FRAMEWORKTEST_LIST_FILE}") + fi + +-$FWTEST_RUN_TEST_SH --driverbin=$FWTEST_DRIVER_BIN \ ++$MY_PATH/framework/run_test.sh --driverbin=$FWTEST_DRIVER_BIN \ + --reportdir=$FWTEST_REPORT_DIR \ + --tapname=$FWTEST_TAP_NAME \ + ${MODELLIST:-} \ +diff --git a/tests/tools/nnpackage_run/CMakeLists.txt b/tests/tools/nnpackage_run/CMakeLists.txt +index 0e333a0..ec45db4 100644 +--- a/tests/tools/nnpackage_run/CMakeLists.txt ++++ b/tests/tools/nnpackage_run/CMakeLists.txt +@@ -33,7 +33,7 @@ target_include_directories(nnpackage_run PRIVATE src) + target_include_directories(nnpackage_run PRIVATE ${Boost_INCLUDE_DIRS}) + + target_link_libraries(nnpackage_run onert_core onert tflite_loader) +-target_link_libraries(nnpackage_run tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite jsoncpp) ++target_link_libraries(nnpackage_run nnfw_lib_tflite jsoncpp) + target_link_libraries(nnpackage_run nnfw-dev) + target_link_libraries(nnpackage_run ${Boost_PROGRAM_OPTIONS_LIBRARY}) + target_link_libraries(nnpackage_run nnfw_lib_benchmark) +diff --git a/tests/tools/nnpackage_run/src/args.cc b/tests/tools/nnpackage_run/src/args.cc +index 0dbcafc..cb4a7db 100644 +--- a/tests/tools/nnpackage_run/src/args.cc ++++ b/tests/tools/nnpackage_run/src/args.cc +@@ -16,6 +16,7 @@ + + #include "args.h" + ++#include + #include + #include + +@@ -105,6 +106,75 @@ Args::Args(const int argc, char **argv) + + void Args::Initialize(void) + { ++ auto process_nnpackage = [&](const std::string &package_filename) { ++ _package_filename = package_filename; ++ ++ std::cerr << "Package Filename " << _package_filename << std::endl; ++ if (_package_filename.empty()) ++ { ++ // TODO Print usage instead of the below message ++ std::cerr << "Please specify nnpackage file. Run with `--help` for usage." 
++ << "\n"; ++ ++ exit(1); ++ } ++ else ++ { ++ if (access(_package_filename.c_str(), F_OK) == -1) ++ { ++ std::cerr << "nnpackage not found: " << _package_filename << "\n"; ++ } ++ } ++ }; ++ ++ auto process_output_sizes = [&](const std::string &output_sizes_json_str) { ++ Json::Value root; ++ Json::Reader reader; ++ if (!reader.parse(output_sizes_json_str, root, false)) ++ { ++ std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n"; ++ exit(1); ++ } ++ ++ auto arg_map = argArrayToMap(root); ++ for (auto &pair : arg_map) ++ { ++ uint32_t key = pair.first; ++ Json::Value &val_json = pair.second; ++ if (!val_json.isUInt()) ++ { ++ std::cerr << "All the values in `output_sizes` must be unsigned integers\n"; ++ exit(1); ++ } ++ uint32_t val = val_json.asUInt(); ++ _output_sizes[key] = val; ++ } ++ }; ++ ++ auto process_shape_prepare = [&](const std::string &shape_str) { ++ try ++ { ++ handleShapeParam(_shape_prepare, shape_str); ++ } ++ catch (const std::exception &e) ++ { ++ std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl; ++ exit(1); ++ } ++ }; ++ ++ auto process_shape_run = [&](const std::string &shape_str) { ++ try ++ { ++ handleShapeParam(_shape_run, shape_str); ++ } ++ catch (const std::exception &e) ++ { ++ std::cerr << "error with '--shape_run' option: " << shape_str << std::endl; ++ exit(1); ++ } ++ }; ++ + // General options + po::options_description general("General options", 100); + +@@ -112,32 +182,33 @@ void Args::Initialize(void) + general.add_options() + ("help,h", "Print available options") + ("version", "Print version and exit immediately") +- ("nnpackage", po::value()->required()) ++ ("nnpackage", po::value()->required()->notifier(process_nnpackage)) + #if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1 +- ("dump,d", po::value()->default_value(""), "Output filename") +- ("load,l", po::value()->default_value(""), "Input filename") ++ ("dump,d", po::value()->default_value("")->notifier([&](const auto &v) { _dump_filename = v; }), "Output filename") ++ ("load,l", po::value()->default_value("")->notifier([&](const auto &v) { _load_filename = v; }), "Input filename") + #endif +- ("output_sizes", po::value(), ++ ("output_sizes", po::value()->notifier(process_output_sizes), + "The output buffer size in JSON 1D array\n" + "If not given, the model's output sizes are used\n" + "e.g. 
'[0, 40, 2, 80]' to set 0th tensor to 40 and 2nd tensor to 80.\n") +- ("num_runs,r", po::value()->default_value(1), "The number of runs") +- ("warmup_runs,w", po::value()->default_value(0), "The number of warmup runs") +- ("run_delay,t", po::value()->default_value(-1), "Delay time(ms) between runs (as default no delay") +- ("gpumem_poll,g", po::value()->default_value(false), "Check gpu memory polling separately") +- ("mem_poll,m", po::value()->default_value(false), "Check memory polling") +- ("write_report,p", po::value()->default_value(false), ++ ("num_runs,r", po::value()->default_value(1)->notifier([&](const auto &v) { _num_runs = v; }), "The number of runs") ++ ("warmup_runs,w", po::value()->default_value(0)->notifier([&](const auto &v) { _warmup_runs = v; }), "The number of warmup runs") ++ ("run_delay,t", po::value()->default_value(-1)->notifier([&](const auto &v) { _run_delay = v; }), "Delay time(ms) between runs (as default no delay") ++ ("gpumem_poll,g", po::value()->default_value(false)->notifier([&](const auto &v) { _gpumem_poll = v; }), "Check gpu memory polling separately") ++ ("mem_poll,m", po::value()->default_value(false)->notifier([&](const auto &v) { _mem_poll = v; }), "Check memory polling") ++ ("write_report,p", po::value()->default_value(false)->notifier([&](const auto &v) { _write_report = v; }), + "Write report\n" + "{exec}-{nnpkg}-{backend}.csv will be generated.\n" + "e.g. nnpackage_run-UNIT_Add_000-acl_cl.csv.\n" + "{nnpkg} name may be changed to realpath if you use symbolic-link.") +- ("shape_prepare", po::value()->default_value("[]"), ++ ("shape_prepare", po::value()->default_value("[]")->notifier(process_shape_prepare), + "set shape of specified tensor before compilation\n" + "e.g. '[0, [1, 2], 2, []]' to set 0th tensor to [1, 2] and 2nd tensor to [].\n") +- ("shape_run", po::value()->default_value("[]"), ++ ("shape_run", po::value()->default_value("[]")->notifier(process_shape_run), + "set shape of specified tensor right before running\n" + "e.g. '[1, [1, 2]]` to set 1st tensor to [1, 2].\n") +- ("verbose_level,v", po::value()->default_value(0), "Verbose level\n" ++ ("verbose_level,v", po::value()->default_value(0)->notifier([&](const auto &v) { _verbose_level = v; }), ++ "Verbose level\n" + "0: prints the only result. Messages btw run don't print\n" + "1: prints result and message btw run\n" + "2: prints all of messages to print\n") +@@ -180,158 +251,23 @@ void Args::Parse(const int argc, char **argv) + return; + } + +- po::notify(vm); + try + { +-#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1 +- if (vm.count("dump")) +- { +- _dump_filename = vm["dump"].as(); +- } +- +- if (vm.count("load")) +- { +- _load_filename = vm["load"].as(); +- } +-#endif +- +- if (vm.count("nnpackage")) +- { +- _package_filename = vm["nnpackage"].as(); +- +- if (_package_filename.empty()) +- { +- // TODO Print usage instead of the below message +- std::cerr << "Please specify nnpackage file. Run with `--help` for usage." 
+- << "\n"; +- +- exit(1); +- } +- else +- { +- if (access(_package_filename.c_str(), F_OK) == -1) +- { +- std::cerr << "nnpackage not found: " << _package_filename << "\n"; +- } +- } +- } +- +- if (vm.count("output_sizes")) +- { +- auto output_sizes_json_str = vm["output_sizes"].as(); +- +- Json::Value root; +- Json::Reader reader; +- if (!reader.parse(output_sizes_json_str, root, false)) +- { +- std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n"; +- exit(1); +- } +- +- auto arg_map = argArrayToMap(root); +- for (auto &pair : arg_map) +- { +- uint32_t key = pair.first; +- Json::Value &val_json = pair.second; +- if (!val_json.isUInt()) +- { +- std::cerr << "All the values in `output_sizes` must be unsigned integers\n"; +- exit(1); +- } +- uint32_t val = val_json.asUInt(); +- _output_sizes[key] = val; +- } +- } +- +- if (vm.count("num_runs")) +- { +- _num_runs = vm["num_runs"].as(); +- } +- +- if (vm.count("warmup_runs")) +- { +- _warmup_runs = vm["warmup_runs"].as(); +- } +- +- if (vm.count("run_delay")) +- { +- _run_delay = vm["run_delay"].as(); +- } +- +- if (vm.count("gpumem_poll")) +- { +- _gpumem_poll = vm["gpumem_poll"].as(); +- } +- +- if (vm.count("mem_poll")) +- { +- _mem_poll = vm["mem_poll"].as(); +- // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP +- if (_mem_poll && _warmup_runs == 0) +- { +- _warmup_runs = 1; +- } +- } +- +- if (vm.count("write_report")) +- { +- _write_report = vm["write_report"].as(); +- } +- +- if (vm.count("verbose_level")) +- { +- _verbose_level = vm["verbose_level"].as(); +- } ++ po::notify(vm); + } + catch (const std::bad_cast &e) + { +- std::cerr << "error by bad cast" << e.what() << '\n'; ++ std::cerr << "Bad cast error - " << e.what() << '\n'; + exit(1); + } + +- if (vm.count("shape_prepare")) +- { +- std::string shape_str; +- try +- { +- shape_str = vm["shape_prepare"].as(); +- } +- catch (const std::bad_cast &e) +- { +- std::cerr << "error by bad cast with '--shape_prepare' option" << e.what() << '\n'; +- exit(1); +- } +- try +- { +- handleShapeParam(_shape_prepare, shape_str); +- } +- catch (const std::exception &e) +- { +- std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl; +- exit(1); +- } +- } +- +- if (vm.count("shape_run")) ++ // This must be run after `notify` as `_warm_up_runs` must have been processed before. 
++ if (vm.count("mem_poll")) + { +- std::string shape_str; +- try +- { +- shape_str = vm["shape_run"].as(); +- } +- catch (const std::bad_cast &e) ++ // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP ++ if (_mem_poll && _warmup_runs == 0) + { +- std::cerr << "error by bad cast with '--shape_run' option" << e.what() << '\n'; +- exit(1); +- } +- try +- { +- handleShapeParam(_shape_run, shape_str); +- } +- catch (const std::exception &e) +- { +- std::cerr << "error with '--shape_run' option: " << shape_str << std::endl; +- exit(1); ++ _warmup_runs = 1; + } + } + } +diff --git a/tests/tools/nnpackage_run/src/h5formatter.cc b/tests/tools/nnpackage_run/src/h5formatter.cc +index 34c075c..09ace47 100644 +--- a/tests/tools/nnpackage_run/src/h5formatter.cc ++++ b/tests/tools/nnpackage_run/src/h5formatter.cc +@@ -145,6 +145,7 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector= v20.05 %endif Requires(post): /sbin/ldconfig diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe new file mode 100644 index 0000000..7322e90 --- /dev/null +++ b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe @@ -0,0 +1,26 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 1 dim: 8 dim: 8 dim: 1 } + quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } +} +operand { + name: "ofm" + type: UINT8 + shape { dim: 1 dim: 7 dim: 7 dim: 1 } + quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } +} +operation { + type: "AveragePool2D" + averagepool2d_options { + padding: VALID + stride_w: 1 + stride_h: 1 + filter_width: 2 + filter_height: 2 + } + input: "ifm" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe new file mode 100644 index 0000000..a09afc1 --- /dev/null +++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe @@ -0,0 +1,44 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 4 dim: 5 dim: 5 } +} +operand { + name: "ker" + type: FLOAT32 + shape { dim: 1 dim: 1 dim: 2 dim: 25 } +} +operand { + name: "bias" + type: FLOAT32 + shape { dim: 25 } + filler { + tag: "constant" + arg: "1.1" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 2 dim: 2 dim: 25 } +} +operation { + type: "DepthwiseConv2D" + version: 2 + depthwiseconv2d_options { + padding: VALID + stride_w: 2 + stride_h: 2 + dilation_w_factor: 2 + dilation_h_factor: 1 + depth_multiplier: 5 + activation : RELU6 + } + input: "ifm" + input: "ker" + input: "bias" + output: "ofm" +} +input: "ifm" +input: "ker" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule new file mode 100644 index 0000000..edfabc6 --- /dev/null +++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule @@ -0,0 +1,3 @@ +# To check if DEPTHWISE_CONV_2D version is 2 + +RULE "OP_VERSION_CHECK" $(op_version DEPTHWISE_CONV_2D) '=' 2 diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe new 
file mode 100644 index 0000000..5e0b6b5 --- /dev/null +++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe @@ -0,0 +1,61 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 1 dim: 112 dim: 112 dim: 4 } + quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 } +} +operand { + name: "ker" + type: UINT8 + shape { dim: 1 dim: 3 dim: 3 dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } + quant { + min: -30.3175 min: -0.779597 min: -10.2751 min: -10.8594 + max: 4.35049 max: 2.70807 max: 11.0269 max: 20.97 + scale:0.135953 scale: 0.0136771 scale: 0.0835375 scale: 0.124821 + zero_point:223 zero_point: 57 zero_point: 123 zero_point: 87 + quantized_dimension: 3 + } +} +operand { + name: "bias" + type: INT32 + shape { dim: 4 } + filler { + tag: "gaussian" + arg: "0" + arg: "1.0" + } + quant { + scale: 1.4758e-16 scale: 3.15185e-05 scale: 2.20685e-05 scale: 1.72205e-16 + zero_point: 0 zero_point: 0 zero_point: 0 zero_point: 0 + } +} +operand { + name: "ofm" + type: UINT8 + shape { dim: 1 dim: 112 dim: 112 dim: 4 } + quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 } + +} +operation { + type: "DepthwiseConv2D" + depthwiseconv2d_options { + padding: SAME + stride_w: 1 + stride_h: 1 + depth_multiplier: 1 + activation : RELU6 + } + input: "ifm" + input: "ker" + input: "bias" + output: "ofm" +} +input: "ifm" +input: "ker" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe new file mode 100644 index 0000000..3fff5cd --- /dev/null +++ b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe @@ -0,0 +1,22 @@ +operand { + name: "ifm1" + type: UINT8 + shape { dim: 1 dim: 4 dim: 4 dim: 3 } + quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128} +} +operand { + name: "ofm" + type: UINT8 + shape { dim: 1 dim: 4 dim: 4 dim: 3 } + quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128} +} +operation { + type: "L2Normalize" + l2norm_options { + activation: NONE + } + input: "ifm1" + output: "ofm" +} +input: "ifm1" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe new file mode 100644 index 0000000..7b2a84d --- /dev/null +++ b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe @@ -0,0 +1,19 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 1 dim: 3 dim: 3 dim: 2 } + quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 } +} +operand { + name: "ofm" + type: UINT8 + shape { dim: 1 dim: 3 dim: 3 dim: 2 } + quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 } +} +operation { + type: "Logistic" + input: "ifm" + output: "ofm" +} +input: "ifm" +output: "ofm" diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe index 79271a4..1313e26 100644 --- a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe +++ b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe @@ -10,7 +10,7 @@ operand { operand 
{ name: "ker" type: FLOAT32 - shape { dim: 1 dim: 3 dim: 3 dim: 1 } + shape { dim: 3 dim: 1 dim: 1 dim: 3 } filler { tag: "gaussian" arg: "0.0" diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_000/test.recipe new file mode 100644 index 0000000..887380c --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_000/test.recipe @@ -0,0 +1,27 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 4 } +} +operand { + name: "ofm" + type: FLOAT32 + shape { } +} +operand { + name: "ofm_idx" + type: INT32 + shape { dim: 4 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT32 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_001/test.recipe new file mode 100644 index 0000000..9beb516 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_001/test.recipe @@ -0,0 +1,27 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 4 } +} +operand { + name: "ofm" + type: FLOAT32 + shape { } +} +operand { + name: "ofm_idx" + type: INT64 + shape { dim: 4 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT64 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_001/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.recipe b/res/TensorFlowLiteRecipes/Unique_002/test.recipe new file mode 100644 index 0000000..67b947f --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_002/test.recipe @@ -0,0 +1,27 @@ +operand { + name: "ifm" + type: INT32 + shape { dim: 5 } +} +operand { + name: "ofm" + type: INT32 + shape { } +} +operand { + name: "ofm_idx" + type: INT32 + shape { dim: 5 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT32 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.reverse b/res/TensorFlowLiteRecipes/Unique_002/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.recipe b/res/TensorFlowLiteRecipes/Unique_003/test.recipe new file mode 100644 index 0000000..375db66 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_003/test.recipe @@ -0,0 +1,27 @@ +operand { + name: "ifm" + type: INT32 + shape { dim: 5 } +} +operand { + name: "ofm" + type: INT32 + shape { } +} +operand { + name: "ofm_idx" + type: INT64 + shape { dim: 5 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT64 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.reverse b/res/TensorFlowLiteRecipes/Unique_003/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe new file mode 100644 index 0000000..d3985e4 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe @@ -0,0 +1,28 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 4 } + quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } +} 
+operand { + name: "ofm" + type: UINT8 + shape { } +} +operand { + name: "ofm_idx" + type: INT32 + shape { dim: 4 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT32 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe new file mode 100644 index 0000000..b08dd85 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe @@ -0,0 +1,28 @@ +operand { + name: "ifm" + type: UINT8 + shape { dim: 5 } + quant { min: 0 max: 255 scale: 1.0 zero_point: 0 } +} +operand { + name: "ofm" + type: UINT8 + shape { } +} +operand { + name: "ofm_idx" + type: INT64 + shape { dim: 5 } +} +operation { + type: "Unique" + unique_options { + idx_out_type: INT64 + } + input: "ifm" + output: "ofm" + output: "ofm_idx" +} +input: "ifm" +output: "ofm" +output: "ofm_idx" diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse new file mode 100644 index 0000000..e69de29 diff --git a/runtime/libs/benchmark/CMakeLists.txt b/runtime/libs/benchmark/CMakeLists.txt index 2af0ffa..748b2d1 100644 --- a/runtime/libs/benchmark/CMakeLists.txt +++ b/runtime/libs/benchmark/CMakeLists.txt @@ -1,6 +1,5 @@ file(GLOB_RECURSE SOURCES "src/*.cpp") -add_library(nnfw_lib_benchmark SHARED ${SOURCES}) +add_library(nnfw_lib_benchmark STATIC ${SOURCES}) target_include_directories(nnfw_lib_benchmark PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) target_link_libraries(nnfw_lib_benchmark PRIVATE ${LIB_PTHREAD}) -install(TARGETS nnfw_lib_benchmark DESTINATION lib) diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp index 7a3f9a5..df573da 100644 --- a/runtime/libs/benchmark/src/Result.cpp +++ b/runtime/libs/benchmark/src/Result.cpp @@ -166,7 +166,7 @@ Result::Result(const Phases &phases) if (option.memory) { print_memory = true; - for (int i = PhaseEnum::MODEL_LOAD; i <= PhaseEnum::EXECUTE; ++i) + for (int i = PhaseEnum::MODEL_LOAD; i < PhaseEnum::EXECUTE; ++i) { auto phase = phases.at(gPhaseStrings[i]); for (int j = MemoryType::RSS; j <= MemoryType::PSS; ++j) diff --git a/runtime/onert/api/include/nnfw.h b/runtime/onert/api/include/nnfw.h index 031aabd..03a3aed 100644 --- a/runtime/onert/api/include/nnfw.h +++ b/runtime/onert/api/include/nnfw.h @@ -99,6 +99,8 @@ typedef enum { NNFW_STATUS_ERROR = 1, /** Unexpected null argument is given. */ NNFW_STATUS_UNEXPECTED_NULL = 2, + /** When a function was called but it is not valid for the current session state. */ + NNFW_STATUS_INVALID_STATE = 3, } NNFW_STATUS; /** @@ -432,10 +434,10 @@ NNFW_STATUS nnfw_output_tensorinfo(nnfw_session *session, uint32_t index, * *

Supported backends differs on each platforms. * For example, `x86_64` supports "cpu" only. - * Can set multiple backends by semicolon (ex: "acl_cl;cpu"). - * Among the multiple backends, the 1st element is used as default backend. - * - * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon", "srcn" + * Multiple backends can be set and they must be separated by a semicolon (ex: "acl_cl;cpu"). + * For each backend string, `libbackend_{backend}.so` will be dynamically loaded during + * {@link nnfw_prepare}. + * Among the multiple backends, the 1st element is used as the default backend.
* * @param[in] session session to which avilable backends are set * @param[in] backends available backends on which nnfw uses @@ -449,12 +451,10 @@ NNFW_STATUS nnfw_set_available_backends(nnfw_session *session, const char *backe * * This function should be called before {@link nnfw_prepare} is invoked. * - * Supported backends differs on each platforms. - * For example, `x86_64` supports "cpu" only. - * The backend for op has higher priority than available backends specified by - * nnfw_set_available_backends. + * The backend for op has higher priority than available backends specified by + * {@link nnfw_set_available_backends}.
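The nnfw.h hunks above redefine the backend-selection contract: backends are passed to nnfw_set_available_backends as a semicolon-separated list whose first entry becomes the default, the matching libbackend_{backend}.so libraries are loaded during nnfw_prepare, the per-operation nnfw_set_op_backend is marked deprecated since 1.8.0, and (together with the nnfw_api_internal.cc hunks below) wrong-state calls now return the new NNFW_STATUS_INVALID_STATE code instead of the generic NNFW_STATUS_ERROR. What follows is a minimal client-side sketch, not part of the patch, assuming the existing public C API entry points (nnfw_create_session, nnfw_load_model_from_file, nnfw_prepare, nnfw_close_session) and a hypothetical nnpackage path "./model/".

#include <nnfw.h>
#include <stdio.h>

int main(void)
{
  nnfw_session *session = NULL;

  if (nnfw_create_session(&session) != NNFW_STATUS_NO_ERROR)
    return 1;

  /* After this patch, calling this before a model is loaded returns
     NNFW_STATUS_INVALID_STATE rather than the generic NNFW_STATUS_ERROR. */
  if (nnfw_set_available_backends(session, "acl_cl;cpu") == NNFW_STATUS_INVALID_STATE)
    printf("backends can only be set after a model is loaded\n");

  /* "./model/" is a placeholder nnpackage directory. */
  if (nnfw_load_model_from_file(session, "./model/") != NNFW_STATUS_NO_ERROR)
    goto fail;

  /* Semicolon-separated list; "acl_cl" is the default backend and
     libbackend_acl_cl.so / libbackend_cpu.so are loaded during nnfw_prepare. */
  if (nnfw_set_available_backends(session, "acl_cl;cpu") != NNFW_STATUS_NO_ERROR)
    goto fail;

  if (nnfw_prepare(session) != NNFW_STATUS_NO_ERROR)
    goto fail;

  /* ... bind inputs/outputs and call nnfw_run(session) here ... */

  nnfw_close_session(session);
  return 0;

fail:
  nnfw_close_session(session);
  return 1;
}

Splitting NNFW_STATUS_INVALID_STATE out of NNFW_STATUS_ERROR presumably lets callers distinguish API-ordering mistakes from genuine runtime failures without parsing the stderr messages shown in the hunks below.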

* - * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon" + * @deprecated Deprecated since 1.8.0. * * @param[in] session session to be modified * @param[in] op operation to be set diff --git a/runtime/onert/api/src/nnfw_api.cc b/runtime/onert/api/src/nnfw_api.cc index 0747583..34a46ed 100644 --- a/runtime/onert/api/src/nnfw_api.cc +++ b/runtime/onert/api/src/nnfw_api.cc @@ -31,6 +31,7 @@ STATIC_ASSERT_ENUM_CHECK(NNFW_TYPE_TENSOR_INT64, 5); STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_NO_ERROR, 0); STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_ERROR, 1); STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_UNEXPECTED_NULL, 2); +STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_INVALID_STATE, 3); STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_NONE, 0); STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_CHANNELS_LAST, 1); diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc index d03ddd4..b3390fa 100644 --- a/runtime/onert/api/src/nnfw_api_internal.cc +++ b/runtime/onert/api/src/nnfw_api_internal.cc @@ -76,7 +76,7 @@ nnfw_session::~nnfw_session() = default; NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir) { if (!isStateInitialized()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; if (!package_dir) { @@ -156,7 +156,7 @@ NNFW_STATUS nnfw_session::prepare() std::cerr << "invalid state"; } std::cerr << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } if (!_subgraphs || !primary_subgraph() || primary_subgraph()->isBuildingPhase()) @@ -188,7 +188,7 @@ NNFW_STATUS nnfw_session::run() { std::cerr << "Error during nnfw_session::run : " << "run should be run after prepare" << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } try @@ -211,7 +211,7 @@ NNFW_STATUS nnfw_session::run_async() { std::cerr << "Error during nnfw_session::run_async : " << "run_async should be run after prepare" << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } _execution->startExecute(); @@ -241,7 +241,7 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo if (!isStatePreparedOrFinishedRun()) { std::cerr << "Error during nnfw_session::set_input : invalid state" << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } if (!buffer && length != 0) @@ -270,7 +270,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b if (!isStatePreparedOrFinishedRun()) { std::cerr << "Error during nnfw_session::set_output : invalid state" << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } if (!buffer && length != 0) @@ -296,7 +296,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b NNFW_STATUS nnfw_session::input_size(uint32_t *number) { if (isStateInitialized()) // Model is not loaded - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; try { @@ -318,7 +318,7 @@ NNFW_STATUS nnfw_session::input_size(uint32_t *number) NNFW_STATUS nnfw_session::output_size(uint32_t *number) { if (isStateInitialized()) // Model is not loaded - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; try { @@ -410,7 +410,7 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti) { std::cerr << "Error during set_input_tensorinfo : should be run after load_model" << std::endl; - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; } if (ti.rank <= 0 || ti.rank > NNFW_MAX_RANK) @@ -463,6 +463,9 @@ NNFW_STATUS nnfw_session::set_input_tensorinfo(uint32_t index, const 
nnfw_tensor NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) { + if (isStateInitialized()) + return NNFW_STATUS_INVALID_STATE; + try { if (ti == nullptr) @@ -499,7 +502,7 @@ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti) { if (isStateInitialized()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; if (ti == nullptr) { @@ -570,7 +573,7 @@ static std::string get_op_backend_string(std::string op) NNFW_STATUS nnfw_session::set_available_backends(const char *backends) { if (!isStateModelLoaded()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; try { @@ -596,7 +599,7 @@ NNFW_STATUS nnfw_session::set_available_backends(const char *backends) NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend) { if (!isStateModelLoaded()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; try { @@ -627,7 +630,7 @@ NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend) NNFW_STATUS nnfw_session::set_config(const char *key, const char *value) { if (!isStateModelLoaded()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; auto &options = _compiler->options(); @@ -693,7 +696,7 @@ onert::ir::Graph *nnfw_session::primary_subgraph() NNFW_STATUS nnfw_session::get_config(const char *key, char *value, size_t value_size) { if (!isStateModelLoaded()) - return NNFW_STATUS_ERROR; + return NNFW_STATUS_INVALID_STATE; auto &options = _compiler->options(); diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index 3ca4058..4ab2d4c 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -31,6 +31,7 @@ #include "exec/FunctionSequence.h" #include "util/logging.h" #include "util/Utils.h" +#include "AclKernelGen.h" namespace onert { @@ -76,15 +77,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); assert(_ctx.at(block_size_index).data()); auto fn = std::make_unique<::arm_compute::CLBatchToSpaceLayer>(); - fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -96,15 +97,27 @@ void KernelGenerator::visit(const ir::operation::Cast &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - const auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 - ? 
arm_compute::SubDataType::BOOL - : arm_compute::SubDataType::NONE; + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::CLCast>(); + std::unique_ptr<::arm_compute::IFunction> fn; + if (ifm_tensor->data_type() == ofm_tensor->data_type()) + { + auto l = std::make_unique<::arm_compute::CLCopy>(); + + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); + fn = std::move(l); + } + else + { + auto l = std::make_unique<::arm_compute::CLCast>(); + + // TODO Support converting float to int32 as round down + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); + + fn = std::move(l); + } auto acl_fn = asAclClFunction(std::move(fn)); @@ -132,10 +145,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) ker_width, ker_height); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -143,8 +156,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) auto fn = std::make_unique<::arm_compute::CLConvolutionLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), - conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(), + ::arm_compute::Size2D(1U, 1U), act_info); _return_fn = asAclClFunction(std::move(fn)); } @@ -171,10 +185,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -182,8 +196,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) { auto fn = std::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), - ofm_alloc->handle(), conv_info, multiplier, act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, multiplier, act_info); _return_fn = asAclClFunction(std::move(fn)); } @@ 
-217,19 +231,20 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, - ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride)}; + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh}, + ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)}; auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::AvgPool2D &node) @@ -260,19 +275,21 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ::arm_compute::PoolingLayerInfo info{ ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; + ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride), + true /* exclude_padding */}; auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Concat &node) @@ -296,7 +313,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) return; } - auto output_alloc = _tensor_builder->at(ofm_index).get(); + auto output_tensor = _tensor_builder->at(ofm_index).get(); std::vector<::arm_compute::ICLTensor *> input_tensors; for (auto &ifm_ind : input_indexes) input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); @@ -305,7 +322,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) if (input_indexes.size() < 2) { auto l = std::make_unique<::arm_compute::CLCopy>(); - l->configure(input_tensors.at(0), output_alloc->handle()); + l->configure(input_tensors.at(0), output_tensor->handle()); fn = std::move(l); } else @@ -313,10 +330,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto l = std::make_unique<::arm_compute::CLConcatenateLayer>(); const auto rank = _ctx.at(ofm_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); 
const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); - l->configure(input_tensors, output_alloc->handle(), fixed_axis); + l->configure(input_tensors, output_tensor->handle(), fixed_axis); fn = std::move(l); } @@ -327,75 +344,15 @@ void KernelGenerator::visit(const ir::operation::Concat &node) void KernelGenerator::visit(const ir::operation::FullyConnected &node) { - using ir::operation::FullyConnected; - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; - const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; - const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; - - const auto input_rank = _ctx.at(input_index).shape().rank(); - - const auto output_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); - UNUSED_RELEASE(output_size); - assert(_ctx.at(bias_index).shape().dim(0) == output_size); - assert(_ctx.at(weight_index).shape().dim(0) == output_size); - const auto batch_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); - const auto input_size = - _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); - - // Check for reshaping input's shape into rank-2 - bool needs_reshape = false; - ir::Shape reshape(2); - if (input_rank == 3 || input_rank == 4) - { - const auto &ifm_shape = _ctx.at(input_index).shape(); - auto feature_size = 1; - for (int i = 0; i < ifm_shape.rank(); ++i) - { - feature_size *= ifm_shape.dim(i); - } - - UNUSED_RELEASE(feature_size); - assert(feature_size == batch_size * input_size); - - // for reshaping - needs_reshape = true; - reshape.dim(0) = batch_size; /* H */ - reshape.dim(1) = input_size; /* W */ - } - + auto output_tensor = _tensor_builder->at(output_index).get(); const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - const auto input_alloc = _tensor_builder->at(input_index).get(); - const auto weight_alloc = _tensor_builder->at(weight_index).get(); - const auto bias_alloc = _tensor_builder->at(bias_index).get(); - const auto frontend_layout = _current_op_seq_layout; - const auto acl_layout = output_alloc->handle()->info()->data_layout(); - - auto fn = std::make_unique( - _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - - arm_compute::CLFullyConnectedReshapingLayer::KernelType kernel_type = - arm_compute::CLFullyConnectedReshapingLayer::KernelType::GENERAL; - if (_ctx.at(weight_index).isConstant()) - { - kernel_type = arm_compute::CLFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS; - assert(_ctx.at(weight_index).data()); - } - fn->configure( - input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), - needs_reshape, - ::onert::backend::acl_common::asTensorShape( - reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), - kernel_type); - + auto fn = acl_common::kernelGenFullyConnected( + node, _ctx, _tensor_builder, _current_op_seq_layout); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), - ActivationBuilder::generate(activation, output_alloc->handle())); + std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Mul &node) @@ -406,17 +363,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node) const auto activation = node.param().activation; - auto 
ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLPixelWiseMultiplication>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Reduce &node) @@ -427,14 +385,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto keep_dims{node.param().keep_dims}; const auto reduce_type = node.param().reduce_type; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // Convert to ACL axes taking into account negative values and possible duplicates. const auto &axes = _ctx.at(axes_index); const auto input_rank = _ctx.at(input_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = input_alloc->layout(); + const auto backend_layout = input_tensor->layout(); std::unique_ptr fn; if (reduce_type == ir::operation::Reduce::ReduceType::MEAN) @@ -443,7 +401,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto acl_axes = acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); - l->configure(input_alloc->handle(), acl_axes, keep_dims, output_alloc->handle()); + l->configure(input_tensor->handle(), acl_axes, keep_dims, output_tensor->handle()); fn = std::move(l); } @@ -453,7 +411,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout); - l->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims, + l->configure(input_tensor->handle(), output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type)); fn = std::move(l); @@ -469,13 +427,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. 
const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || frontend_layout == backend_layout); UNUSED_RELEASE(frontend_layout); @@ -483,7 +441,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) auto fn = std::make_unique<::arm_compute::CLReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -503,10 +461,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) (void)dims; (void)ndim; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); _return_fn = std::move(acl_fn); } @@ -516,15 +474,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -538,13 +496,13 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto beta = node.param().beta; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLSoftmaxLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), output_alloc->handle(), beta); + fn->configure(input_tensor->handle(), output_tensor->handle(), beta); auto acl_fn = asAclClFunction(std::move(fn)); @@ -558,10 +516,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ 
-613,7 +571,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) auto fn = std::make_unique<::arm_compute::CLSlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set); auto acl_fn = asAclClFunction(std::move(fn)); @@ -628,10 +586,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ -704,7 +662,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto fn = std::make_unique<::arm_compute::CLStridedSlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set, begin_mask, end_mask, shrink_axis_mask); auto acl_fn = asAclClFunction(std::move(fn)); @@ -720,10 +678,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) const auto rank = _ctx.at(ifm_idx).shape().rank(); - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); std::vector pv(perm.cbegin(), perm.cend()); // Reversed @@ -732,7 +690,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) auto fn = std::make_unique<::arm_compute::CLPermute>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); auto acl_fn = asAclClFunction(std::move(fn)); @@ -747,17 +705,18 @@ void KernelGenerator::visit(const ir::operation::Add &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLArithmeticAddition>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const 
ir::operation::Sub &node) @@ -768,17 +727,18 @@ void KernelGenerator::visit(const ir::operation::Sub &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLArithmeticSubtraction>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Div &node) @@ -789,16 +749,17 @@ void KernelGenerator::visit(const ir::operation::Div &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLArithmeticDivision>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Exp &node) @@ -806,12 +767,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLExpLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -823,12 +784,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -842,20 +803,21 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) 
const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto gamma_alloc = _tensor_builder->at(gamma_index).get(); - auto beta_alloc = _tensor_builder->at(beta_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto gamma_tensor = _tensor_builder->at(gamma_index).get(); + auto beta_tensor = _tensor_builder->at(beta_index).get(); auto epsilon = node.param().epsilon; auto activation = node.param().activation; auto fn = std::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), - beta_alloc->handle(), epsilon); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), + beta_tensor->handle(), epsilon); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Logistic &node) @@ -863,15 +825,15 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -884,13 +846,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::CLBinaryLogicalOp>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), ::arm_compute::BinaryLogicalOperation::AND); auto acl_fn = asAclClFunction(std::move(fn)); @@ -900,159 +862,8 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) void KernelGenerator::visit(const ir::operation::LSTM &node) { - // TODO Support dynamic rnn - // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. 
- const auto scratch_buffer_index{ - node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; - const auto output_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; - const auto cell_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; - const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; - - const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; - const auto input_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional - const auto input_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; - const auto input_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; - const auto input_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; - const auto recurrent_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional - const auto recurrent_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; - const auto recurrent_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; - const auto recurrent_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; - const auto cell_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional - const auto cell_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional - const auto cell_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional - const auto input_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; - const auto forget_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; - const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; - const auto output_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; - const auto projection_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional - const auto projection_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional - const auto output_state_in_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; - const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; - const auto cell_threshold = node.param().cell_threshold; - const auto projection_threshold = node.param().projection_threshold; - - bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; - bool has_recurrent_to_input_weights = - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; - bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; - bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; - bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && - _ctx.at(projection_weights_index).shape().dim(1) != 0; - bool 
has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); - - // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. - // true: no CIFG - // false: CIFG - // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). - bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; - - // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. - // But the cell_to_input_weights does not exist in regular CIFG although peephole. - // true: peephole - // false: no peephole - bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; - - // NOTE Although the projection weights has data the projection bias may not have data. - bool has_projection_param = has_projection_weights; - - const auto activation = node.param().activation; - const auto cell_clip = cell_threshold; - const auto projection_clip = projection_threshold; - assert(cell_clip >= 0.f && projection_clip >= 0.f); - - auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); - auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); - auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); - auto output_alloc = _tensor_builder->at(output_index).get(); - - auto input_alloc = _tensor_builder->at(input_index).get(); - - auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); - auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); - auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); - auto recurrent_to_forget_weights_alloc = - _tensor_builder->at(recurrent_to_forget_weights_index).get(); - auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); - auto recurrent_to_output_weights_alloc = - _tensor_builder->at(recurrent_to_output_weights_index).get(); - - auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); - auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); - auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); - auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); - auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); - - auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); - - auto fn = std::make_unique<::arm_compute::CLLSTMLayer>(); - - ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{}; - if (has_cifg_param) - { - auto input_to_input_weights_alloc = - _tensor_builder->at(input_to_input_weights_index).get(); // optional - auto recurrent_to_input_weights_alloc = - _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional - auto cell_to_input_weights_handle = - has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() - : nullptr; // optional (non-cifg && peephole) - auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional - lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), - recurrent_to_input_weights_alloc->handle(), - cell_to_input_weights_handle, input_gate_bias_alloc->handle()); - } - if (has_peephole_param) - { - auto cell_to_forget_weights_alloc = - _tensor_builder->at(cell_to_forget_weights_index).get(); // optional - auto cell_to_output_weights_alloc = - _tensor_builder->at(cell_to_output_weights_index).get(); // optional - lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), - cell_to_output_weights_alloc->handle()); - } - if (has_projection_param) - { - auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional - auto projection_bias_handle = has_projection_bias - ? _tensor_builder->at(projection_bias_index).get()->handle() - : nullptr; // optional - lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); - } - - fn->configure( - input_alloc->handle(), input_to_forget_weights_alloc->handle(), - input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), - recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), - recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), - cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), - cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), - output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), - lstm_params, act_info, cell_clip, projection_clip); - - auto acl_fn = asAclClFunction(std::move(fn)); - - _return_fn = std::move(acl_fn); + _return_fn = acl_common::kernelGenLSTM(node, _ctx, _tensor_builder); } void KernelGenerator::visit(const ir::operation::Comparison &node) @@ -1063,13 +874,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto comparison_type = node.param().comparison_type; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::CLComparison>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), (arm_compute::ComparisonOperation)comparison_type); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1107,13 +918,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : input_indexes) { size_t input_rank = _ctx.at(input_index).shape().rank(); - const auto &input_alloc = _tensor_builder->at(input_index); - orig_inputs_acl_tensor_shapes.emplace_back(input_alloc->info()->tensor_shape()); - assert(input_rank == input_alloc->num_dimensions()); - if (input_rank != input_alloc->info()->num_dimensions()) + const auto &input_tensor = _tensor_builder->at(input_index); + orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape()); + assert(input_rank == input_tensor->num_dimensions()); + if (input_rank != 
input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1135,8 +946,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(0)}; const auto permute_type = node.getPermuteType(); - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto rank = _ctx.at(ofm_idx).shape().rank(); assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); @@ -1149,7 +960,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::CLPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1160,7 +971,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::CLPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1168,7 +979,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) { auto l = std::make_unique<::arm_compute::CLCopy>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); } @@ -1183,12 +994,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLRsqrtLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); _return_fn = asAclClFunction(std::move(fn)); } @@ -1198,15 +1009,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1219,12 +1030,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto 
ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLScale>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); @@ -1238,15 +1049,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1258,15 +1069,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1288,25 +1099,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node) const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); - auto weights_alloc = _tensor_builder->at(weights_index).get(); - auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); - auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); + auto weights_tensor = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get(); auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); auto copy_layer = std::make_unique<::arm_compute::CLCopy>(); - copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); + copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle()); 
_return_fn = asAclClFunction(std::move(copy_layer)); - auto fn = std::make_unique<::arm_compute::CLRNNLayerEx>( + auto fn = std::make_unique<::arm_compute::CLRNNLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), - bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), - act_info); + fn->configure(input_tensor->handle(), weights_tensor->handle(), + recurrent_weights_tensor->handle(), bias_tensor->handle(), + hidden_state_out_tensor->handle(), output_tensor->handle(), act_info); _return_fn = asAclClFunction(std::move(fn)); } @@ -1315,12 +1126,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLFloor>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1335,10 +1146,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); - auto paddings_alloc = _tensor_builder->at(paddings_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); + auto paddings_tensor = _tensor_builder->at(paddings_index).get(); assert(_ctx.at(block_size_index).data()); assert(_ctx.at(paddings_index).data()); @@ -1346,8 +1157,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) std::unique_ptr<::arm_compute::IFunction> fn; auto l = std::make_unique<::arm_compute::CLSpaceToBatchLayer>(); - l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), - ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(), + ofm_tensor->handle()); fn = std::move(l); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1362,12 +1173,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) auto block_size = node.param().block_size; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::CLSpaceToDepth>(); + auto fn = std::make_unique<::arm_compute::CLSpaceToDepthLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1389,19 +1200,21 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); 
const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ::arm_compute::PoolingLayerInfo info{ ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, + ifm_tensor->info()->data_layout(), ::onert::backend::acl_common::asPadStrideInfo(padding, stride)}; auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) @@ -1410,13 +1223,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::CLEmbeddingLookup>(); - fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1442,15 +1255,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) float bias = 0.0f; // Don't offset the reduction. 
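The extra ifm_tensor->info()->data_layout() argument above presumably reflects the PoolingLayerInfo constructor that takes the data layout explicitly; that assumption matches the four-argument brace initialization used in the visitor. A standalone sketch of the same descriptor construction:

#include <arm_compute/core/Types.h>

// Builds the same pooling descriptor as the L2Pool2D visitor above; the
// DataLayout value comes straight from the input tensor's info.
::arm_compute::PoolingLayerInfo makeL2PoolInfo(unsigned kw, unsigned kh,
                                               ::arm_compute::DataLayout layout,
                                               const ::arm_compute::PadStrideInfo &pad_stride)
{
  return ::arm_compute::PoolingLayerInfo{::arm_compute::PoolingType::L2,
                                         ::arm_compute::Size2D{kw, kh}, layout, pad_stride};
}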
- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, radius, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1466,17 +1279,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hits_alloc = _tensor_builder->at(hits_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hits_tensor = _tensor_builder->at(hits_index).get(); - auto lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto keys_alloc = _tensor_builder->at(keys_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto keys_tensor = _tensor_builder->at(keys_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::CLHashtableLookup>(); - fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), - output_alloc->handle(), hits_alloc->handle()); + fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), + output_tensor->handle(), hits_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1489,13 +1302,13 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto alpha_tensor = _tensor_builder->at(alpha_index).get(); - auto fn = std::make_unique<::arm_compute::CLPReLU>(); + auto fn = std::make_unique<::arm_compute::CLPReluLayer>(); - fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1518,7 +1331,6 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) (node.param().padding.type == ir::PaddingType::VALID)); auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride, ker_shape.W, ker_shape.H); - uint32_t invalid_horizontal = 0; uint32_t invalid_vertical = 0; if (node.param().padding.type == ir::PaddingType::VALID) @@ -1528,17 +1340,17 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); } - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = 
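As a plain CPU reference for what the CROSS_MAP trick above emulates (assuming ACL evaluates x / (bias + alpha * sum(x^2))^beta with scaling disabled, so bias = 0, alpha = 1, beta = 0.5 reduces to dividing by the L2 norm across channels):

#include <cmath>
#include <vector>

std::vector<float> l2_normalize(const std::vector<float> &channels)
{
  float sum_sq = 0.f;
  for (float v : channels)
    sum_sq += v * v;                              // the "reduction" in the comments above
  const float inv_norm = 1.f / std::sqrt(sum_sq); // pow(reduction, -0.5); no epsilon, mirroring bias = 0.0f
  std::vector<float> out;
  out.reserve(channels.size());
  for (float v : channels)
    out.push_back(v * inv_norm);
  return out;
}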
_tensor_builder->at(ker_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); auto fn = std::make_unique<::arm_compute::CLTransposeConvLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, - invalid_horizontal, invalid_vertical); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), + tconv_info, invalid_horizontal, invalid_vertical); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1550,15 +1362,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1571,13 +1383,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::CLBitwiseOr>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1589,12 +1401,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLBitwiseNot>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1607,13 +1419,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_alloc = 
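The invalid_horizontal / invalid_vertical values above can be read as "how much wider or taller the IR's output is than what the ACL deconvolution naturally produces". A small worked check, assuming the natural output extent is (ifm - 1) * stride + ker:

#include <cassert>
#include <cstdint>

static uint32_t invalid_border(uint32_t ofm, uint32_t ifm, uint32_t stride, uint32_t ker)
{
  // Mirrors: ofm - (1 + (ifm - 1) * stride) - (ker - 1)
  return ofm - (1 + (ifm - 1) * stride) - (ker - 1);
}

int main()
{
  // ifm W = 4, stride 2, kernel W = 3 -> natural output width 9;
  // an IR output width of 10 therefore reports one invalid column.
  assert(invalid_border(10, 4, 2, 3) == 1);
  // When the IR output matches exactly, nothing is invalid.
  assert(invalid_border(9, 4, 2, 3) == 0);
  return 0;
}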
_tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLElementwiseSquaredDiff>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1634,13 +1446,13 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node) const auto k = node.param().k; - auto values_alloc = _tensor_builder->at(outputValues_index).get(); - auto indices_alloc = _tensor_builder->at(outputIndices_index).get(); - auto input_alloc = _tensor_builder->at(inputData_index).get(); + auto values_tensor = _tensor_builder->at(outputValues_index).get(); + auto indices_tensor = _tensor_builder->at(outputIndices_index).get(); + auto input_tensor = _tensor_builder->at(inputData_index).get(); auto fn = std::make_unique<::arm_compute::CLTopKV2>(); - fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle()); + fn->configure(input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1659,9 +1471,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw); const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto indices_alloc = _tensor_builder->at(indices_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto indices_tensor = _tensor_builder->at(indices_index).get(); // NOTE The frontend layout and backend layout must be the same for this operation. // If not the same, we have to add a stage(?) to perform permutation of output tensor. It @@ -1671,43 +1483,43 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // a model. For example, if a model in NHWC has this operation as output rank == 4, indices // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
- const auto backend_layout = ofm_alloc->layout(); + const auto backend_layout = ofm_tensor->layout(); UNUSED_RELEASE(backend_layout); - assert(backend_layout == ifm_alloc->layout()); - assert(backend_layout == indices_alloc->layout()); + assert(backend_layout == ifm_tensor->layout()); + assert(backend_layout == indices_tensor->layout()); assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); auto fn = std::make_unique<::arm_compute::CLGatherEx>(); // input is n-D, indices k-D, output is (n + k - 1)-D size_t n = ifm_rank; - assert(n == ifm_alloc->num_dimensions()); + assert(n == ifm_tensor->num_dimensions()); size_t k = _ctx.at(indices_index).shape().rank(); - assert(k == indices_alloc->num_dimensions()); + assert(k == indices_tensor->num_dimensions()); // Disable applied dim_correction - const auto orig_ifm_acl_tensor_shape = ifm_alloc->info()->tensor_shape(); - if (n != ifm_alloc->info()->num_dimensions()) + const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape(); + if (n != ifm_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction const auto ifm = _ctx.at(ifm_index); - ifm_alloc->info()->set_tensor_shape( + ifm_tensor->info()->set_tensor_shape( acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); } - const auto orig_indice_acl_tensor_shape = indices_alloc->info()->tensor_shape(); - if (k != indices_alloc->info()->num_dimensions()) + const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape(); + if (k != indices_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and indices tensor is applied dim_correction const auto indices = _ctx.at(indices_index); - indices_alloc->info()->set_tensor_shape( + indices_tensor->info()->set_tensor_shape( acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); } - fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); // Revert disabling applied dim_correction - ifm_alloc->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); - indices_alloc->info()->set_tensor_shape(orig_indice_acl_tensor_shape); + ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); + indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1719,12 +1531,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLNeg>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1736,15 +1548,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = 
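The disable/revert dance around CLGatherEx::configure above could also be written as a small scope guard; this is only a sketch of the pattern, not code from the patch:

#include <arm_compute/core/ITensorInfo.h>
#include <arm_compute/core/TensorShape.h>

class ShapeOverrideGuard
{
public:
  ShapeOverrideGuard(::arm_compute::ITensorInfo *info, const ::arm_compute::TensorShape &tmp_shape)
      : _info{info}, _orig_shape{info->tensor_shape()}
  {
    _info->set_tensor_shape(tmp_shape); // e.g. a shape built with dim_correction disabled
  }
  ~ShapeOverrideGuard() { _info->set_tensor_shape(_orig_shape); } // revert after configure()

private:
  ::arm_compute::ITensorInfo *_info;
  ::arm_compute::TensorShape _orig_shape;
};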
_tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1761,11 +1573,11 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) assert((ifm_shape.rank() - 1) == ofm_shape.rank()); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); auto frontend_layout = _current_op_seq_layout; - auto backend_layout = ifm_alloc->layout(); + auto backend_layout = ifm_tensor->layout(); int axis_value = node.param().axis; if (axis_value < 0) @@ -1776,10 +1588,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); - auto fn = std::make_unique<::arm_compute::CLArgOperation>(); + auto fn = std::make_unique<::arm_compute::CLArgMinMaxLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis}, - ::arm_compute::ArgOperation::MAX); + fn->configure(ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), + ::arm_compute::ReductionOperation::ARG_IDX_MAX); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1791,12 +1603,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); - auto fn = std::make_unique<::arm_compute::CLCast>(); + auto fn = std::make_unique<::arm_compute::CLDequantizationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1814,15 +1626,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod auto beta = node.param().beta; auto bias = node.param().bias; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo( ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1837,12 +1649,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) auto block_size = node.param().block_size; assert(block_size > 0); - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = 
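Both ArgMax visitors (and the ReductionOperation::ARG_IDX_MAX call above) first wrap a possibly negative axis into [0, rank); a tiny self-checking example of that step:

#include <cassert>

static int normalize_axis(int axis, int rank)
{
  if (axis < 0)
    axis += rank; // e.g. axis -1 on a rank-4 tensor means the last axis
  assert(axis >= 0 && axis < rank);
  return axis;
}

int main()
{
  assert(normalize_axis(-1, 4) == 3);
  assert(normalize_axis(2, 4) == 2);
  return 0;
}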
_tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); - auto fn = std::make_unique<::arm_compute::CLDepthToSpace>(); + auto fn = std::make_unique<::arm_compute::CLDepthToSpaceLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); + fn->configure(input_tensor->handle(), output_tensor->handle(), block_size); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1860,13 +1672,13 @@ void KernelGenerator::visit(const ir::operation::Split &node) for (const auto &output : node.getOutputs()) output_indexes.emplace_back(output); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - std::vector output_allocs; + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + std::vector output_tensors; for (const auto &ofm_ind : output_indexes) - output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); auto axis = node.param().axis; if (axis < 0) axis += ifm_rank; @@ -1874,7 +1686,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) auto fn = std::make_unique<::arm_compute::CLSplit>(); - fn->configure(ifm_alloc->handle(), output_allocs, axis); + fn->configure(ifm_tensor->handle(), output_tensors, axis); _return_fn = asAclClFunction(std::move(fn)); } @@ -1906,13 +1718,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : output_indexes) { size_t output_rank = _ctx.at(output_index).shape().rank(); - const auto &output_alloc = _tensor_builder->at(output_index); - orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape()); - assert(output_rank == output_alloc->num_dimensions()); - if (output_rank != output_alloc->info()->num_dimensions()) + const auto &output_tensor = _tensor_builder->at(output_index); + orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); + assert(output_rank == output_tensor->num_dimensions()); + if (output_rank != output_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - output_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1959,12 +1771,12 @@ void KernelGenerator::visit(const ir::operation::Pad &node) // Disable applied dim_correction size_t input_rank = _ctx.at(input_index).shape().rank(); - const auto &input_alloc = _tensor_builder->at(input_index); - assert(input_rank == input_alloc->num_dimensions()); - if (input_rank != input_alloc->info()->num_dimensions()) + const auto &input_tensor = _tensor_builder->at(input_index); + assert(input_rank == input_tensor->num_dimensions()); + if (input_rank != input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(input_index).shape(), frontend_layout, backend_layout, false)); } @@ -1982,13 +1794,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) const auto 
lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLElementwiseMin>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -2001,13 +1813,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLElementwiseMax>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -2019,12 +1831,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE, + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0); auto acl_fn = asAclClFunction(std::move(fn)); @@ -2037,12 +1849,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE, + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0); auto acl_fn = asAclClFunction(std::move(fn)); diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h new file mode 100644 index 0000000..6253434 --- /dev/null +++ b/runtime/onert/backend/acl_common/AclKernelGen.h @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ +#define __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ + +#include +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace acl_common +{ + +template +std::unique_ptr +kernelGenLSTM(const ir::operation::LSTM &node, const ir::Operands &operands, + const std::shared_ptr &tensor_builder) +{ + // TODO Support dynamic rnn + // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. + const auto scratch_buffer_index{ + node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + const auto output_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + const auto cell_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + + const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto input_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional + const auto input_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; + const auto input_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; + const auto input_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional + const auto recurrent_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; + const auto recurrent_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; + const auto recurrent_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto cell_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional + const auto cell_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional + const auto cell_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional + const auto input_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; + const auto forget_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; + const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; + const auto output_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; + const auto projection_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional + const auto projection_bias_index{ + 
node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional + const auto output_state_in_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; + const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; + const auto cell_threshold = node.param().cell_threshold; + const auto projection_threshold = node.param().projection_threshold; + + bool has_input_to_input_weights = operands.at(input_to_input_weights_index).shape().dim(0) != 0 && + operands.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + bool has_cell_to_forget_weights = operands.at(cell_to_forget_weights_index).shape().dim(0) != 0; + bool has_cell_to_output_weights = operands.at(cell_to_output_weights_index).shape().dim(0) != 0; + bool has_projection_weights = operands.at(projection_weights_index).shape().dim(0) != 0 && + operands.at(projection_weights_index).shape().dim(1) != 0; + bool has_projection_bias = operands.at(projection_bias_index).shape().dim(0); + + // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. + // true: no CIFG + // false: CIFG + // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + + // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. + // But the cell_to_input_weights does not exist in regular CIFG although peephole. + // true: peephole + // false: no peephole + bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; + + // NOTE Although the projection weights has data the projection bias may not have data. 
+ bool has_projection_param = has_projection_weights; + + const auto activation = node.param().activation; + const auto cell_clip = cell_threshold; + const auto projection_clip = projection_threshold; + assert(cell_clip >= 0.f && projection_clip >= 0.f); + + auto scratch_buffer_tensor = tensor_builder->at(scratch_buffer_index).get(); + auto output_state_out_tensor = tensor_builder->at(output_state_out_index).get(); + auto cell_state_out_tensor = tensor_builder->at(cell_state_out_index).get(); + auto output_tensor = tensor_builder->at(output_index).get(); + + auto input_tensor = tensor_builder->at(input_index).get(); + + auto input_to_forget_weights_tensor = tensor_builder->at(input_to_forget_weights_index).get(); + auto input_to_cell_weights_tensor = tensor_builder->at(input_to_cell_weights_index).get(); + auto input_to_output_weights_tensor = tensor_builder->at(input_to_output_weights_index).get(); + auto recurrent_to_forget_weights_tensor = + tensor_builder->at(recurrent_to_forget_weights_index).get(); + auto recurrent_to_cell_weights_tensor = tensor_builder->at(recurrent_to_cell_weights_index).get(); + auto recurrent_to_output_weights_tensor = + tensor_builder->at(recurrent_to_output_weights_index).get(); + + auto forget_gate_bias_tensor = tensor_builder->at(forget_gate_bias_index).get(); + auto cell_bias_tensor = tensor_builder->at(cell_bias_index).get(); + auto output_gate_bias_tensor = tensor_builder->at(output_gate_bias_index).get(); + auto output_state_in_tensor = tensor_builder->at(output_state_in_index).get(); + auto cell_state_in_tensor = tensor_builder->at(cell_state_in_index).get(); + + auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); + + auto fn = std::make_unique(); + + ::arm_compute::LSTMParams lstm_params{}; + if (has_cifg_param) + { + auto input_to_input_weights_tensor = + tensor_builder->at(input_to_input_weights_index).get(); // optional + auto recurrent_to_input_weights_tensor = + tensor_builder->at(recurrent_to_input_weights_index).get(); // optional + auto cell_to_input_weights_handle = + has_peephole_param ? tensor_builder->at(cell_to_input_weights_index).get()->handle() + : nullptr; // optional (non-cifg && peephole) + auto input_gate_bias_tensor = tensor_builder->at(input_gate_bias_index).get(); // optional + lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(), + recurrent_to_input_weights_tensor->handle(), + cell_to_input_weights_handle, input_gate_bias_tensor->handle()); + } + if (has_peephole_param) + { + auto cell_to_forget_weights_tensor = + tensor_builder->at(cell_to_forget_weights_index).get(); // optional + auto cell_to_output_weights_tensor = + tensor_builder->at(cell_to_output_weights_index).get(); // optional + lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(), + cell_to_output_weights_tensor->handle()); + } + if (has_projection_param) + { + auto projection_weights_tensor = tensor_builder->at(projection_weights_index).get(); // optional + auto projection_bias_handle = has_projection_bias + ? 
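The has_* flags above treat an optional LSTM input as present only when its operand has a non-empty shape; in shorthand (names mirror the code, the helpers are illustrative):

#include <cstdint>
#include <vector>

struct Shape
{
  std::vector<int32_t> dims;
  int32_t dim(size_t i) const { return dims[i]; }
};

// 2-D optional weights (e.g. input_to_input_weights, projection_weights)
static bool present2d(const Shape &s) { return s.dim(0) != 0 && s.dim(1) != 0; }
// 1-D optional weights/bias (e.g. cell_to_forget_weights, projection_bias)
static bool present1d(const Shape &s) { return s.dim(0) != 0; }

// has_cifg_param       : both non-CIFG weights present, i.e. true means *not* CIFG
// has_peephole_param   : cell_to_forget_weights and cell_to_output_weights present
// has_projection_param : projection_weights present; the projection bias may still be absent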
tensor_builder->at(projection_bias_index).get()->handle() + : nullptr; // optional + lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle); + } + + fn->configure(input_tensor->handle(), input_to_forget_weights_tensor->handle(), + input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(), + recurrent_to_forget_weights_tensor->handle(), + recurrent_to_cell_weights_tensor->handle(), + recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(), + cell_bias_tensor->handle(), output_gate_bias_tensor->handle(), + output_state_in_tensor->handle(), cell_state_in_tensor->handle(), + scratch_buffer_tensor->handle(), output_state_out_tensor->handle(), + cell_state_out_tensor->handle(), output_tensor->handle(), lstm_params, act_info, + cell_clip, projection_clip); + + return std::make_unique(std::move(fn)); +} + +template +std::unique_ptr +kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Operands &operands, + const std::shared_ptr &tensor_builder, ir::Layout layout) +{ + using ir::operation::FullyConnected; + + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; + const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; + const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; + + const auto input_rank = operands.at(input_index).shape().rank(); + + const auto output_size = + operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1); + UNUSED_RELEASE(output_size); + assert(operands.at(bias_index).shape().dim(0) == output_size); + assert(operands.at(weight_index).shape().dim(0) == output_size); + const auto batch_size = + operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2); + const auto input_size = + operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1); + + // Check for reshaping input's shape into rank-2 + bool needs_reshape = false; + ir::Shape reshape(2); + if (input_rank == 3 || input_rank == 4) + { + const auto &ifm_shape = operands.at(input_index).shape(); + auto feature_size = 1; + for (int i = 0; i < ifm_shape.rank(); ++i) + { + feature_size *= ifm_shape.dim(i); + } + + UNUSED_RELEASE(feature_size); + assert(feature_size == batch_size * input_size); + + // for reshaping + needs_reshape = true; + reshape.dim(0) = batch_size; /* H */ + reshape.dim(1) = input_size; /* W */ + } + + auto output_tensor = tensor_builder->at(output_index).get(); + const auto input_tensor = tensor_builder->at(input_index).get(); + const auto weight_tensor = tensor_builder->at(weight_index).get(); + const auto bias_tensor = tensor_builder->at(bias_index).get(); + const auto frontend_layout = layout; + const auto acl_layout = output_tensor->handle()->info()->data_layout(); + + auto fn = + std::make_unique(tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + typename T_ACLLayer::KernelType kernel_type = T_ACLLayer::KernelType::GENERAL; + if (operands.at(weight_index).isConstant()) + { + kernel_type = T_ACLLayer::KernelType::PREPROCESSED_WEIGHTS; + assert(operands.at(weight_index).data()); + } + + fn->configure( + input_tensor->handle(), weight_tensor->handle(), bias_tensor->handle(), + output_tensor->handle(), needs_reshape, + ::onert::backend::acl_common::asTensorShape( + reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), + kernel_type); + + return 
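The rank-2 reshape logic in kernelGenFullyConnected above flattens a rank-3/4 input to [batch_size, input_size] and checks that no elements are gained or lost; a quick numeric check of that invariant (the shapes are made up for illustration):

#include <cassert>

int main()
{
  const int ifm[4] = {2, 4, 4, 8};  // e.g. an NHWC input
  const int input_size = 4 * 4 * 8; // weight.dim(rank - 1) = 128
  const int batch_size = 2;         // output.dim(rank - 2)

  int feature_size = 1;
  for (int d : ifm)
    feature_size *= d; // 256 elements in total

  assert(feature_size == batch_size * input_size);
  // reshape = {batch_size /* H */, input_size /* W */} -> {2, 128}
  return 0;
}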
std::make_unique(std::move(fn)); +} + +} // namespace acl_common +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc index e471867..37ec993 100644 --- a/runtime/onert/backend/acl_neon/KernelGenerator.cc +++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc @@ -31,6 +31,7 @@ #include "exec/NopFunction.h" #include "util/logging.h" #include "util/Utils.h" +#include "AclKernelGen.h" namespace onert { @@ -74,15 +75,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -96,10 +97,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto frontend_layout = _current_op_seq_layout; - auto backend_layout = ifm_alloc->layout(); + auto backend_layout = ifm_tensor->layout(); int axis_value = node.param().axis; if (axis_value < 0) @@ -112,7 +113,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto fn = std::make_unique<::arm_compute::NEArgMinMaxLayer>(); - fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle(), + fn->configure(ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), arm_compute::ReductionOperation::ARG_IDX_MAX); auto acl_fn = asAclFunction(std::move(fn)); @@ -127,15 +128,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); assert(_ctx.at(block_size_index).data()); auto fn = std::make_unique<::arm_compute::NEBatchToSpaceLayer>(); - fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -147,15 +148,26 @@ void KernelGenerator::visit(const ir::operation::Cast &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = 
_tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::NECast>(); + std::unique_ptr<::arm_compute::IFunction> fn; + if (ifm_tensor->data_type() == ofm_tensor->data_type()) + { + auto l = std::make_unique<::arm_compute::NECopy>(); + + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + } + else + { + auto l = std::make_unique<::arm_compute::NECast>(); - auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 - ? arm_compute::SubDataType::BOOL - : arm_compute::SubDataType::NONE; - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); + + fn = std::move(l); + } auto acl_fn = asAclFunction(std::move(fn)); @@ -183,10 +195,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) ker_width, ker_height); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -194,8 +206,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) auto fn = std::make_unique<::arm_compute::NEConvolutionLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), - conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(), + ::arm_compute::Size2D(1U, 1U), act_info); _return_fn = asAclFunction(std::move(fn)); } @@ -208,12 +221,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) auto block_size = node.param().block_size; assert(block_size > 0); - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); - auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayerEx>(); + auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); + fn->configure(input_tensor->handle(), output_tensor->handle(), block_size); auto acl_fn = asAclFunction(std::move(fn)); @@ -242,10 +255,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto 
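The reworked acl_neon Cast above now splits into two cases; the sketch below shows the same dispatch in isolation (the wrapper function is hypothetical, the ACL calls mirror the diff):

#include <arm_compute/core/ITensor.h>
#include <arm_compute/runtime/IFunction.h>
#include <arm_compute/runtime/NEON/functions/NECast.h>
#include <arm_compute/runtime/NEON/functions/NECopy.h>
#include <memory>

std::unique_ptr<::arm_compute::IFunction> makeNeonCast(::arm_compute::ITensor *ifm,
                                                       ::arm_compute::ITensor *ofm)
{
  if (ifm->info()->data_type() == ofm->info()->data_type())
  {
    // Same element type: a conversion kernel is unnecessary, copy instead.
    auto copy = std::make_unique<::arm_compute::NECopy>();
    copy->configure(ifm, ofm);
    return copy;
  }
  // Different element types: saturating conversion, as in the visitor above.
  auto cast = std::make_unique<::arm_compute::NECast>();
  cast->configure(ifm, ofm, ::arm_compute::ConvertPolicy::SATURATE);
  return cast;
}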
ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -253,8 +266,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) { auto fn = std::make_unique<::arm_compute::NEDepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), - ofm_alloc->handle(), conv_info, multiplier, act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, multiplier, act_info); _return_fn = asAclFunction(std::move(fn)); } @@ -265,12 +278,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::NEDequantizationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -305,19 +318,19 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, - ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride)}; + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::MAX, ::arm_compute::Size2D{kw, kh}, + ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride)}; auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::AvgPool2D &node) @@ -348,19 +361,20 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ::arm_compute::PoolingLayerInfo info{ ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; + ifm_tensor->info()->data_layout(), acl_common::asPadStrideInfo(padding, stride), + true /* exclude_padding */}; auto fn = 
std::make_unique<::arm_compute::NEPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Concat &node) @@ -383,7 +397,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) return; } - auto output_alloc = _tensor_builder->at(ofm_index).get(); + auto output_tensor = _tensor_builder->at(ofm_index).get(); std::vector<::arm_compute::ITensor *> input_tensors; for (const auto &ifm_ind : input_indexes) input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); @@ -392,7 +406,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) if (input_indexes.size() < 2) { auto l = std::make_unique<::arm_compute::NECopy>(); - l->configure(input_tensors.at(0), output_alloc->handle()); + l->configure(input_tensors.at(0), output_tensor->handle()); fn = std::move(l); } else @@ -400,10 +414,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto l = std::make_unique<::arm_compute::NEConcatenateLayer>(); const auto rank = _ctx.at(ofm_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); - l->configure(input_tensors, output_alloc->handle(), fixed_axis); + l->configure(input_tensors, output_tensor->handle(), fixed_axis); fn = std::move(l); } @@ -418,13 +432,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::NEEmbeddingLookup>(); - fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -436,12 +450,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NEFloor>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -450,76 +464,15 @@ void KernelGenerator::visit(const ir::operation::Floor &node) void KernelGenerator::visit(const ir::operation::FullyConnected 
&node) { - using ir::operation::FullyConnected; - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; - const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; - const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; - - const auto input_rank = _ctx.at(input_index).shape().rank(); - - const auto output_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); - UNUSED_RELEASE(output_size); - assert(_ctx.at(bias_index).shape().dim(0) == output_size); - assert(_ctx.at(weight_index).shape().dim(0) == output_size); - const auto batch_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); - const auto input_size = - _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); - - // Check for reshaping input's shape into rank-2 - bool needs_reshape = false; - ir::Shape reshape(2); - if (input_rank == 3 || input_rank == 4) - { - const auto &ifm_shape = _ctx.at(input_index).shape(); - auto feature_size = 1; - for (int i = 0; i < ifm_shape.rank(); ++i) - { - feature_size *= ifm_shape.dim(i); - } - - UNUSED_RELEASE(feature_size); - assert(feature_size == batch_size * input_size); - - // for reshaping - needs_reshape = true; - reshape.dim(0) = batch_size; /* H */ - reshape.dim(1) = input_size; /* W */ - } - + auto output_tensor = _tensor_builder->at(output_index).get(); const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - const auto input_alloc = _tensor_builder->at(input_index).get(); - const auto weight_alloc = _tensor_builder->at(weight_index).get(); - const auto bias_alloc = _tensor_builder->at(bias_index).get(); - const auto frontend_layout = _current_op_seq_layout; - const auto acl_layout = output_alloc->handle()->info()->data_layout(); - - auto fn = std::make_unique( - _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - - arm_compute::NEFullyConnectedReshapingLayer::KernelType kernel_type = - arm_compute::NEFullyConnectedReshapingLayer::KernelType::GENERAL; - if (_ctx.at(weight_index).isConstant()) - { - kernel_type = arm_compute::NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS; - assert(_ctx.at(weight_index).data()); - } - - fn->configure( - input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), - needs_reshape, - ::onert::backend::acl_common::asTensorShape( - reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), - kernel_type); - + auto fn = acl_common::kernelGenFullyConnected( + node, _ctx, _tensor_builder, _current_op_seq_layout); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), - ActivationBuilder::generate(activation, output_alloc->handle())); + std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle())); } void KernelGenerator::visit(const ir::operation::HashtableLookup &node) @@ -531,17 +484,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hits_alloc = _tensor_builder->at(hits_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hits_tensor = _tensor_builder->at(hits_index).get(); - auto 
lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto keys_alloc = _tensor_builder->at(keys_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto keys_tensor = _tensor_builder->at(keys_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::NEHashtableLookup>(); - fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), - output_alloc->handle(), hits_alloc->handle()); + fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), + output_tensor->handle(), hits_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -561,10 +514,10 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // Converting in reverse order const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto indices_alloc = _tensor_builder->at(indices_index).get(); - const auto backend_layout = ofm_alloc->layout(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto indices_tensor = _tensor_builder->at(indices_index).get(); + const auto backend_layout = ofm_tensor->layout(); UNUSED_RELEASE(backend_layout); // NOTE The frontend layout and backend layout must be the same for this operation. @@ -575,35 +528,35 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // a model. For example, if a model in NHWC has this operation as output rank == 4, indices // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
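// Illustrative sketch of the reverse-order axis conversion the NOTE above is
// about (an assumed standalone helper, not the actual acl_common::ToARMComputeAxis
// implementation): ACL numbers dimensions from the innermost one outwards, so a
// frontend axis on a rank-N tensor maps roughly to "rank - 1 - axis" once
// negative axes are normalized. The mapping is only meaningful when frontend
// and backend layouts agree, which is what the surrounding asserts guard.
#include <cassert>

int to_acl_axis(int rank, int axis) // hypothetical helper for illustration
{
  if (axis < 0)
    axis += rank; // e.g. axis -1 on a rank-4 tensor becomes 3
  assert(axis >= 0 && axis < rank);
  return rank - 1 - axis; // reverse-order mapping, e.g. rank 4, axis 1 -> 2
}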
- assert(backend_layout == ifm_alloc->layout()); - assert(backend_layout == indices_alloc->layout()); + assert(backend_layout == ifm_tensor->layout()); + assert(backend_layout == indices_tensor->layout()); assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); auto fn = std::make_unique<::arm_compute::NEGatherEx>(); // input is n-D, indices k-D, output is (n + k - 1)-D size_t n = ifm_rank; - assert(n == ifm_alloc->num_dimensions()); + assert(n == ifm_tensor->num_dimensions()); size_t k = _ctx.at(indices_index).shape().rank(); - assert(k == indices_alloc->num_dimensions()); + assert(k == indices_tensor->num_dimensions()); // Disable applied dim_correction - if (n != ifm_alloc->info()->num_dimensions()) + if (n != ifm_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction const auto ifm = _ctx.at(ifm_index); - ifm_alloc->info()->set_tensor_shape( + ifm_tensor->info()->set_tensor_shape( acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); } - if (k != indices_alloc->info()->num_dimensions()) + if (k != indices_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and indices tensor is applied dim_correction const auto indices = _ctx.at(indices_index); - indices_alloc->info()->set_tensor_shape( + indices_tensor->info()->set_tensor_shape( acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); } - fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); // acl_neon doesn't not revert disabling applied dim_correction because acl_neon's kernels would // use arm_compute::TensorInfo::offset_element_in_bytes() @@ -621,20 +574,20 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto gamma_alloc = _tensor_builder->at(gamma_index).get(); - auto beta_alloc = _tensor_builder->at(beta_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto gamma_tensor = _tensor_builder->at(gamma_index).get(); + auto beta_tensor = _tensor_builder->at(beta_index).get(); auto epsilon = node.param().epsilon; auto activation = node.param().activation; auto fn = std::make_unique<::arm_compute::NEInstanceNormalizationLayerEx>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), - beta_alloc->handle(), epsilon); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), + beta_tensor->handle(), epsilon); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::L2Normalization &node) @@ -656,15 +609,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) float bias = 0.0f; // Don't offset the reduction. 
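// What the CROSS_MAP normalization configured above reduces to: the layer
// computes roughly out = in / (bias + scale * sum(in^2))^beta over its window,
// so with bias = 0, beta = 0.5 and a window spanning the whole channel axis it
// becomes plain L2 normalization, out[c] = in[c] / sqrt(sum_k in[k]^2).
// Reference-only sketch (standalone code, not the ACL kernel):
#include <cmath>
#include <vector>

std::vector<float> l2_normalize(const std::vector<float> &channels)
{
  float sum_sq = 0.0f;
  for (float v : channels)
    sum_sq += v * v; // the "reduction" mentioned in the comments above
  const float inv_norm = 1.0f / std::sqrt(sum_sq); // beta = 0.5 -> 1/sqrt, bias = 0
  std::vector<float> out;
  out.reserve(channels.size());
  for (float v : channels)
    out.push_back(v * inv_norm);
  return out;
}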
- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, radius, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::NENormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -686,19 +639,20 @@ void KernelGenerator::visit(const ir::operation::L2Pool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); ::arm_compute::PoolingLayerInfo info{ ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, + ifm_tensor->info()->data_layout(), ::onert::backend::acl_common::asPadStrideInfo(padding, stride)}; auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node) @@ -712,15 +666,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod auto beta = node.param().beta; auto bias = node.param().bias; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo( ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::NENormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -733,13 +687,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::NELogicalAnd>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -751,12 +705,12 @@ void KernelGenerator::visit(const 
ir::operation::LogicalNot &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::NEBitwiseNot>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -769,13 +723,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::NELogicalOr>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -787,8 +741,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; @@ -798,7 +752,7 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) // instead of 'INF', and then the result of this op will be errors due to the 'NaN'. auto fn = std::make_unique<::arm_compute::NEActivationLayerEx>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -807,159 +761,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) void KernelGenerator::visit(const ir::operation::LSTM &node) { - // TODO Support dynamic rnn - // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. 
- const auto scratch_buffer_index{ - node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; - const auto output_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; - const auto cell_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; - const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; - - const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; - const auto input_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional - const auto input_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; - const auto input_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; - const auto input_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; - const auto recurrent_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional - const auto recurrent_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; - const auto recurrent_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; - const auto recurrent_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; - const auto cell_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional - const auto cell_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional - const auto cell_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional - const auto input_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; - const auto forget_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; - const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; - const auto output_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; - const auto projection_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional - const auto projection_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional - const auto output_state_in_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; - const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; - const auto cell_threshold = node.param().cell_threshold; - const auto projection_threshold = node.param().projection_threshold; - - bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; - bool has_recurrent_to_input_weights = - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; - bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; - bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; - bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && - _ctx.at(projection_weights_index).shape().dim(1) != 0; - bool 
has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); - - // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. - // true: no CIFG - // false: CIFG - // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). - bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; - - // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. - // But the cell_to_input_weights does not exist in regular CIFG although peephole. - // true: peephole - // false: no peephole - bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; - - // NOTE Although the projection weights has data the projection bias may not have data. - bool has_projection_param = has_projection_weights; - - const auto activation = node.param().activation; - const auto cell_clip = cell_threshold; - const auto projection_clip = projection_threshold; - assert(cell_clip >= 0.f && projection_clip >= 0.f); - - auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); - auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); - auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); - auto output_alloc = _tensor_builder->at(output_index).get(); - - auto input_alloc = _tensor_builder->at(input_index).get(); - - auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); - auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); - auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); - auto recurrent_to_forget_weights_alloc = - _tensor_builder->at(recurrent_to_forget_weights_index).get(); - auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); - auto recurrent_to_output_weights_alloc = - _tensor_builder->at(recurrent_to_output_weights_index).get(); - - auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); - auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); - auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); - auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); - auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); - - auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); - - auto fn = std::make_unique<::arm_compute::NELSTMLayer>(); - - ::arm_compute::LSTMParams<::arm_compute::ITensor> lstm_params{}; - if (has_cifg_param) - { - auto input_to_input_weights_alloc = - _tensor_builder->at(input_to_input_weights_index).get(); // optional - auto recurrent_to_input_weights_alloc = - _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional - auto cell_to_input_weights_handle = - has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() - : nullptr; // optional (non-cifg && peephole) - auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional - lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), - recurrent_to_input_weights_alloc->handle(), - cell_to_input_weights_handle, input_gate_bias_alloc->handle()); - } - if (has_peephole_param) - { - auto cell_to_forget_weights_alloc = - _tensor_builder->at(cell_to_forget_weights_index).get(); // optional - auto cell_to_output_weights_alloc = - _tensor_builder->at(cell_to_output_weights_index).get(); // optional - lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), - cell_to_output_weights_alloc->handle()); - } - if (has_projection_param) - { - auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional - auto projection_bias_handle = has_projection_bias - ? _tensor_builder->at(projection_bias_index).get()->handle() - : nullptr; // optional - lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); - } - - fn->configure( - input_alloc->handle(), input_to_forget_weights_alloc->handle(), - input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), - recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), - recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), - cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), - cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), - output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), - lstm_params, act_info, cell_clip, projection_clip); - - auto acl_fn = asAclFunction(std::move(fn)); - - _return_fn = std::move(acl_fn); + _return_fn = acl_common::kernelGenLSTM(node, _ctx, _tensor_builder); } void KernelGenerator::visit(const ir::operation::Mul &node) @@ -970,18 +773,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEPixelWiseMultiplication>(); // RoundingPolicy for scale:1.0 is only allowed RoundingPolicy::TO_ZERO - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Neg &node) @@ -989,12 +792,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto 
ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NENegLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1030,12 +833,12 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : input_indexes) { size_t input_rank = _ctx.at(input_index).shape().rank(); - const auto &input_alloc = _tensor_builder->at(input_index); - assert(input_rank == input_alloc->num_dimensions()); - if (input_rank != input_alloc->info()->num_dimensions()) + const auto &input_tensor = _tensor_builder->at(input_index); + assert(input_rank == input_tensor->num_dimensions()); + if (input_rank != input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1094,8 +897,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(0)}; const auto permute_type = node.getPermuteType(); - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto rank = _ctx.at(ofm_idx).shape().rank(); assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); @@ -1108,7 +911,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::NEPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1119,7 +922,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::NEPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1127,7 +930,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) { auto l = std::make_unique<::arm_compute::NECopy>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); } @@ -1143,15 +946,15 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto alpha_tensor = _tensor_builder->at(alpha_index).get(); std::unique_ptr<::arm_compute::IFunction> fn; - auto l = std::make_unique<::arm_compute::NEPReLU>(); + auto l = std::make_unique<::arm_compute::NEPReluLayer>(); - l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); @@ -1166,14 +969,14 @@ void KernelGenerator::visit(const 
ir::operation::Reduce &node) const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // Convert to ACL axes taking into account negative values and possible duplicates. const auto &axes = _ctx.at(axes_index); const auto input_rank = _ctx.at(input_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = input_alloc->layout(); + const auto backend_layout = input_tensor->layout(); const auto reduce_axes = acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); const auto reduce_type = node.param().reduce_type; @@ -1182,11 +985,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) std::unique_ptr<::arm_compute::IFunction> fn; if (reduce_type == ir::operation::Reduce::ReduceType::MEAN) { - // NOTE NEReduceMean has a bug that does not support NHWC layout - // NEReduceMean intermediate tensors are always NCHW layout - auto l = std::make_unique<::arm_compute::NEReduceMeanEx>(); + auto l = std::make_unique<::arm_compute::NEReduceMean>(); - l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle()); + l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle()); fn = std::move(l); } @@ -1194,7 +995,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) { auto l = std::make_unique<::arm_compute::NEReduceSum>(); - l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle()); + l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle()); fn = std::move(l); } @@ -1202,7 +1003,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) { auto l = std::make_unique<::arm_compute::NEReduceOperation>(); - l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle(), + l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(), acl_common::convertReduceType(reduce_type)); fn = std::move(l); @@ -1218,15 +1019,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1238,15 +1039,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); 
const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1258,15 +1059,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1278,13 +1079,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. 
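// Why the NOTE above matters: flattening the same logical element from NHWC
// and NCHW storage gives different linear offsets once rank >= 4, so a plain
// buffer-level reshape would silently permute data if the two sides disagreed
// on layout. Minimal standalone illustration (the shape and index are an
// assumed example, not taken from the patch):
#include <array>
#include <cstdio>

int main()
{
  const int N = 1, H = 2, W = 2, C = 3;
  (void)N;
  const std::array<int, 4> idx = {0, 1, 0, 2}; // logical (n, h, w, c)

  const int nhwc_offset = ((idx[0] * H + idx[1]) * W + idx[2]) * C + idx[3];
  const int nchw_offset = ((idx[0] * C + idx[3]) * H + idx[1]) * W + idx[2];

  // Prints 8 and 10: the same logical element sits at different flat offsets,
  // which is exactly the case the assert above rules out for rank >= 4.
  std::printf("NHWC offset: %d, NCHW offset: %d\n", nhwc_offset, nchw_offset);
  return 0;
}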
const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || frontend_layout == backend_layout); UNUSED_RELEASE(frontend_layout); @@ -1292,7 +1093,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) auto fn = std::make_unique(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1305,12 +1106,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NEScale>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); @@ -1334,25 +1135,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node) const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); - auto weights_alloc = _tensor_builder->at(weights_index).get(); - auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); - auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); + auto weights_tensor = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get(); auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); auto copy_layer = std::make_unique<::arm_compute::NECopy>(); - copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); + copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle()); _return_fn = asAclFunction(std::move(copy_layer)); - auto fn = std::make_unique<::arm_compute::NERNNLayerEx>( + auto fn = std::make_unique<::arm_compute::NERNNLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), - bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), - act_info); + fn->configure(input_tensor->handle(), weights_tensor->handle(), + recurrent_weights_tensor->handle(), bias_tensor->handle(), + hidden_state_out_tensor->handle(), output_tensor->handle(), act_info); _return_fn = asAclFunction(std::move(fn)); } @@ -1361,12 +1162,12 @@ void KernelGenerator::visit(const 
ir::operation::RSQRT &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NERsqrtLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); } @@ -1383,10 +1184,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) (void)dims; (void)ndim; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); _return_fn = std::move(acl_fn); } @@ -1396,15 +1197,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1417,13 +1218,25 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)}; const auto beta = node.param().beta; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; + const auto backend_layout = input_tensor->layout(); + + // Disable applied dim_correction + const size_t input_rank = _ctx.at(input_index).shape().rank(); + if (input_rank != input_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and input tensor is applied dim_correction + const auto input = _ctx.at(input_index); + input_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false)); + } auto fn = std::make_unique<::arm_compute::NESoftmaxLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), output_alloc->handle(), beta); + fn->configure(input_tensor->handle(), output_tensor->handle(), beta); auto acl_fn = asAclFunction(std::move(fn)); @@ -1438,20 +1251,18 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto 
paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); - auto paddings_alloc = _tensor_builder->at(paddings_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); + auto paddings_tensor = _tensor_builder->at(paddings_index).get(); assert(_ctx.at(block_size_index).data()); assert(_ctx.at(paddings_index).data()); - // NESpaceToBatchLayer has a bug that padding's values are 0 even when zero point of QASYMM8 is - // not 0. - auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayerEx>(); + auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayer>(); - fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), - ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(), + ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1465,12 +1276,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) auto block_size = node.param().block_size; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayerEx>(); + auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size); auto acl_fn = asAclFunction(std::move(fn)); @@ -1489,13 +1300,13 @@ void KernelGenerator::visit(const ir::operation::Split &node) for (const auto &output : node.getOutputs()) output_indexes.emplace_back(output); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - std::vector output_allocs; + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + std::vector output_tensors; for (const auto &ofm_ind : output_indexes) - output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); auto axis = node.param().axis; if (axis < 0) axis += ifm_rank; @@ -1503,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) auto fn = std::make_unique<::arm_compute::NESplit>(); - fn->configure(ifm_alloc->handle(), output_allocs, axis); + fn->configure(ifm_tensor->handle(), output_tensors, axis); _return_fn = asAclFunction(std::move(fn)); } @@ -1513,15 +1324,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; auto fn = 
std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1534,13 +1345,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseSquaredDiff>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1555,17 +1366,17 @@ void KernelGenerator::visit(const ir::operation::Sub &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEArithmeticSubtraction>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Slice &node) @@ -1575,10 +1386,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ -1628,7 +1439,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) auto fn = std::make_unique<::arm_compute::NESlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set); auto acl_fn = asAclFunction(std::move(fn)); @@ -1643,10 +1454,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto 
strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ -1715,7 +1526,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto fn = std::make_unique<::arm_compute::NEStridedSlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set, begin_mask, end_mask, shrink_axis_mask); auto acl_fn = asAclFunction(std::move(fn)); @@ -1749,16 +1560,16 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); } - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); auto fn = std::make_unique<::arm_compute::NETransposeConvLayer>(); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, - invalid_horizontal, invalid_vertical); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), + tconv_info, invalid_horizontal, invalid_vertical); auto acl_fn = asAclFunction(std::move(fn)); @@ -1771,10 +1582,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; const auto &perm{node.param().perm}; - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - const auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + const auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); const auto rank = _ctx.at(ifm_idx).shape().rank(); std::vector pv(perm.cbegin(), perm.cend()); @@ -1783,11 +1594,11 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) std::unique_ptr<::arm_compute::IFunction> fn; - if (ifm_alloc->num_dimensions() <= 2 && ofm_alloc->num_dimensions() <= 2) + if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2) { auto l = std::make_unique<::arm_compute::NETranspose>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); } @@ -1795,7 +1606,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) { auto l = std::make_unique<::arm_compute::NEPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); fn = 
std::move(l); } @@ -1834,13 +1645,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : output_indexes) { size_t output_rank = _ctx.at(output_index).shape().rank(); - const auto &output_alloc = _tensor_builder->at(output_index); - orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape()); - assert(output_rank == output_alloc->num_dimensions()); - if (output_rank != output_alloc->info()->num_dimensions()) + const auto &output_tensor = _tensor_builder->at(output_index); + orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); + assert(output_rank == output_tensor->num_dimensions()); + if (output_rank != output_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - output_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1858,17 +1669,17 @@ void KernelGenerator::visit(const ir::operation::Add &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEArithmeticAddition>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Div &node) @@ -1879,16 +1690,16 @@ void KernelGenerator::visit(const ir::operation::Div &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseDivision>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); _return_fn = std::make_unique( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Exp &node) @@ -1896,12 +1707,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn 
= std::make_unique<::arm_compute::NEExpLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1913,12 +1724,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::NEReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1933,13 +1744,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto comparison_type = node.param().comparison_type; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseComparison>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), (arm_compute::ComparisonOperation)comparison_type); auto acl_fn = asAclFunction(std::move(fn)); @@ -1953,13 +1764,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseMin>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1972,13 +1783,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseMax>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); diff --git 
a/runtime/onert/backend/cpu/ConstantInitializer.cc b/runtime/onert/backend/cpu/ConstantInitializer.cc index 71e3136..deb27f0 100644 --- a/runtime/onert/backend/cpu/ConstantInitializer.cc +++ b/runtime/onert/backend/cpu/ConstantInitializer.cc @@ -15,6 +15,7 @@ */ #include "ConstantInitializer.h" +#include "Tensor.h" namespace onert { @@ -30,39 +31,61 @@ ConstantInitializer::ConstantInitializer(const ir::Operands &operands, // DO NOTHING } +void ConstantInitializer::registerDefaultInitializer(const ir::OperandIndex &index, + const ir::Operand &obj) +{ + registerExternalInitializer(index, obj); +} + +void ConstantInitializer::registerExternalInitializer(const ir::OperandIndex &index, + const ir::Operand &obj) +{ + // For only CONSTANTS + // TODO Add to check if tensor has been allocated + if (!obj.isConstant()) + return; + + _init_map[index] = [](const onert::ir::Operand &model_obj, onert::backend::ITensor &itensor) { + auto data = model_obj.shareData(); + assert(data && data->base()); + ExternalTensor &tensor = dynamic_cast(itensor); + tensor.setData(data); + }; +} + void ConstantInitializer::visit(const ir::operation::Conv2D &node) { const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); const auto &kernel_obj = _operands.at(kernel_index); - registerCopyInitializer(kernel_index, kernel_obj); + registerExternalInitializer(kernel_index, kernel_obj); const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); const auto &bias_obj = _operands.at(bias_index); - registerCopyInitializer(bias_index, bias_obj); + registerExternalInitializer(bias_index, bias_obj); } void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node) { const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); const auto &kernel_obj = _operands.at(kernel_index); - registerCopyInitializer(kernel_index, kernel_obj); + registerExternalInitializer(kernel_index, kernel_obj); const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); const auto &bias_obj = _operands.at(bias_index); - registerCopyInitializer(bias_index, bias_obj); + registerExternalInitializer(bias_index, bias_obj); } void ConstantInitializer::visit(const ir::operation::FullyConnected &node) { const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); const auto &weight_obj = _operands.at(weight_index); - registerCopyInitializer(weight_index, weight_obj); + registerExternalInitializer(weight_index, weight_obj); const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); if (!bias_index.undefined()) { const auto &bias_obj = _operands.at(bias_index); - registerCopyInitializer(bias_index, bias_obj); + registerExternalInitializer(bias_index, bias_obj); } } diff --git a/runtime/onert/backend/cpu/ConstantInitializer.h b/runtime/onert/backend/cpu/ConstantInitializer.h index bd06c64..de03a69 100644 --- a/runtime/onert/backend/cpu/ConstantInitializer.h +++ b/runtime/onert/backend/cpu/ConstantInitializer.h @@ -36,6 +36,15 @@ public: const std::shared_ptr &tensor_builder); public: + void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override; + + // TODO: For now the only cpu backend supports constant tensor to use data from external + // If the other backend supports (to do this, + // ExternalTensor should be abstract such as IExternal, maybe), + // this can be an interface of IConstantInitializer + void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &); + 
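// Sketch of the zero-copy idea behind registerExternalInitializer above
// (illustrative names only -- SharedBuffer and ZeroCopyTensor are not the
// onert classes): instead of copying constant weights element by element into
// a newly allocated backend tensor, the tensor just keeps a shared reference
// to the data already owned by the loaded model.
#include <cstdint>
#include <memory>
#include <vector>

struct SharedBuffer // stands in for the model-owned constant data
{
  std::vector<std::uint8_t> bytes;
  const std::uint8_t *base() const { return bytes.data(); }
};

class ZeroCopyTensor // stands in for the cpu backend's ExternalTensor
{
public:
  void setData(std::shared_ptr<const SharedBuffer> data) { _data = std::move(data); }
  const std::uint8_t *buffer() const { return _data ? _data->base() : nullptr; }

private:
  std::shared_ptr<const SharedBuffer> _data; // keeps the data alive; no memcpy
};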
+public: void visit(const ir::operation::Conv2D &) override; void visit(const ir::operation::DepthwiseConv2D &) override; void visit(const ir::operation::FullyConnected &) override; diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc index 72f9606..2766aa2 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.cc +++ b/runtime/onert/backend/cpu/KernelGenerator.cc @@ -60,6 +60,7 @@ #include "ops/SoftMaxLayer.h" #include "ops/StridedSliceLayer.h" #include "ops/SpaceToBatchNDLayer.h" +#include "ops/SpaceToDepthLayer.h" #include "ops/SplitLayer.h" #include "ops/SubLayer.h" #include "ops/TanhLayer.h" @@ -70,11 +71,13 @@ #include "ops/ZerosLikeLayer.h" #include "ops/SquaredDiffLayer.h" #include "ops/LogicalOrLayer.h" +#include "ops/L2NormLayer.h" #include "ops/MatrixBandPartLayer.h" #include "ops/BatchMatMulLayer.h" #include "ops/BroadcastToLayer.h" #include "ops/FusedBatchNormLayer.h" #include "ops/LogSoftMaxLayer.h" +#include "ops/QuantizeLayer.h" #include #include @@ -184,10 +187,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); - auto ker_alloc = _tensor_builder->portableAt(ker_index).get(); - auto bias_alloc = _tensor_builder->portableAt(bias_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + auto ker_tensor = _tensor_builder->portableAt(ker_index).get(); + auto bias_tensor = _tensor_builder->portableAt(bias_index).get(); const auto stride = node.param().stride; const auto activation = node.param().activation; @@ -196,9 +199,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) if (_ctx.at(ifm_index).info().isDynamic() || _ctx.at(ker_index).info().isDynamic()) { - fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, param_padding.param.left, + fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, param_padding.param.left, param_padding.param.right, param_padding.param.top, param_padding.param.bottom, - stride.horizontal, stride.vertical, activation, ofm_alloc); + stride.horizontal, stride.vertical, activation, ofm_tensor); _return_fn = std::move(fn); return; @@ -213,9 +216,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto padding = ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height); - fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, padding.left, padding.right, - padding.top, padding.bottom, stride.horizontal, stride.vertical, activation, - ofm_alloc); + fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left, + padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, + activation, ofm_tensor); _return_fn = std::move(fn); } @@ -241,16 +244,16 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); - auto ker_alloc = _tensor_builder->portableAt(ker_index).get(); - auto bias_alloc = _tensor_builder->portableAt(bias_index).get(); + auto ofm_tensor = 
_tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + auto ker_tensor = _tensor_builder->portableAt(ker_index).get(); + auto bias_tensor = _tensor_builder->portableAt(bias_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ker_alloc, bias_alloc, padding.left, padding.right, padding.top, + fn->configure(ifm_tensor, ker_tensor, bias_tensor, padding.left, padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, multiplier, activation, - ofm_alloc); + ofm_tensor); _return_fn = std::move(fn); } @@ -270,13 +273,13 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom, - stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc); + fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom, + stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -295,13 +298,13 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom, - stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc); + fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom, + stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -313,7 +316,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) const auto rank = _ctx.at(ofm_index).shape().rank(); const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); std::vector input_tensors; for (auto &ifm_idx : node.getInputs()) @@ -321,7 +324,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto fn = std::make_unique(); - fn->configure(input_tensors, axis, output_alloc); + fn->configure(input_tensors, axis, output_tensor); _return_fn = std::move(fn); } @@ -332,13 +335,13 @@ void KernelGenerator::visit(const ir::operation::Fill &node) const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)}; const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto value_alloc = _tensor_builder->portableAt(value_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = 
_tensor_builder->portableAt(input_index).get(); + auto value_tensor = _tensor_builder->portableAt(value_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, value_alloc, output_alloc); + fn->configure(input_tensor, value_tensor, output_tensor); _return_fn = std::move(fn); } @@ -353,15 +356,15 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node) const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto weight_alloc = _tensor_builder->portableAt(weight_index).get(); - auto bias_alloc = + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto weight_tensor = _tensor_builder->portableAt(weight_index).get(); + auto bias_tensor = bias_index.undefined() ? nullptr : _tensor_builder->portableAt(bias_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, weight_alloc, bias_alloc, activation, output_alloc); + fn->configure(input_tensor, weight_tensor, bias_tensor, activation, output_tensor); _return_fn = std::move(fn); } @@ -371,21 +374,21 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); // optional 2nd input - IPortableTensor *shape_alloc = nullptr; + IPortableTensor *shape_tensor = nullptr; if (node.getInputs().size() == 2) { const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)}; - shape_alloc = _tensor_builder->portableAt(shape_index).get(); + shape_tensor = _tensor_builder->portableAt(shape_index).get(); } auto fn = std::make_unique(); - fn->configure(input_alloc, shape_alloc, output_alloc); + fn->configure(input_tensor, shape_tensor, output_tensor); _return_fn = std::move(fn); } @@ -394,13 +397,13 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); // Squeeze can share same kernel with reshape auto fn = std::make_unique(); - fn->configure(input_alloc, nullptr, output_alloc); + fn->configure(input_tensor, nullptr, output_tensor); _return_fn = std::move(fn); } @@ -412,12 +415,12 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto beta = node.param().beta; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, beta, output_alloc); + fn->configure(input_tensor, 
beta, output_tensor); _return_fn = std::move(fn); } @@ -430,13 +433,13 @@ void KernelGenerator::visit(const ir::operation::Add &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -447,15 +450,15 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto comparison_type = node.param().comparison_type; auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, comparison_type, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, comparison_type, ofm_tensor); _return_fn = std::move(fn); } @@ -466,11 +469,11 @@ void KernelGenerator::visit(const ir::operation::Gather &node) const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto indices_alloc = _tensor_builder->portableAt(indices_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto indices_tensor = _tensor_builder->portableAt(indices_index).get(); - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); UNUSED_RELEASE(backend_layout); // NOTE The frontend layout and backend layout must be the same for this operation. @@ -481,8 +484,8 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // a model. For example, if a model in NHWC has this operation as output rank == 4, indices // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
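The layout restriction noted in the comment above is easiest to see with a concrete axis mapping. The helper below is purely illustrative (it is not part of this patch; ops::getAxis is the conversion actually used elsewhere in this file) and only spells out why a Gather expressed on NHWC axes cannot simply be re-indexed for an NCHW backend.

    // Illustrative NHWC -> NCHW axis mapping for a rank-4 tensor:
    //   N(0) -> 0, H(1) -> 2, W(2) -> 3, C(3) -> 1
    int nhwc_to_nchw_axis(int axis)
    {
      static const int map[4] = {0, 2, 3, 1};
      return map[axis];
    }
    // Gather with output rank 4, indices rank 2 and axis == 2 spans the NHWC axes W and C.
    // In NCHW those map to 3 and 1, which are not adjacent, so the operation cannot be
    // handled by converting the axis alone; frontend and backend layouts must match.
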
- assert(backend_layout == input_alloc->layout()); - assert(backend_layout == indices_alloc->layout()); + assert(backend_layout == input_tensor->layout()); + assert(backend_layout == indices_tensor->layout()); const auto &input_shape = _ctx.at(input_index).shape(); UNUSED_RELEASE(input_shape); assert(input_shape.rank() < 4 || _current_op_seq_layout == backend_layout); @@ -492,7 +495,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node) auto fn = std::make_unique(); - fn->configure(input_alloc, indices_alloc, output_alloc, axis_value); + fn->configure(input_tensor, indices_tensor, output_tensor, axis_value); _return_fn = std::move(fn); } @@ -506,13 +509,13 @@ void KernelGenerator::visit(const ir::operation::Sub &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -526,13 +529,13 @@ void KernelGenerator::visit(const ir::operation::Mul &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -547,18 +550,18 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) const auto axis = node.param().axis; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto indices_alloc = _tensor_builder->portableAt(indices_index).get(); - auto depth_alloc = _tensor_builder->portableAt(depth_index).get(); - auto onvalue_alloc = _tensor_builder->portableAt(onvalue_index).get(); - auto offvalue_alloc = _tensor_builder->portableAt(offvalue_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto indices_tensor = _tensor_builder->portableAt(indices_index).get(); + auto depth_tensor = _tensor_builder->portableAt(depth_index).get(); + auto onvalue_tensor = _tensor_builder->portableAt(onvalue_index).get(); + auto offvalue_tensor = _tensor_builder->portableAt(offvalue_index).get(); - assert(indices_alloc->data_type() == OperandType::INT32); - assert(axis <= static_cast(indices_alloc->num_dimensions())); + assert(indices_tensor->data_type() == OperandType::INT32); + assert(axis <= static_cast(indices_tensor->num_dimensions())); auto fn = std::make_unique(); - fn->configure(indices_alloc, depth_alloc, onvalue_alloc, offvalue_alloc, output_alloc, axis); + fn->configure(indices_tensor, depth_tensor, onvalue_tensor, offvalue_tensor, output_tensor, axis); _return_fn = std::move(fn); } @@ -572,13 +575,13 @@ void KernelGenerator::visit(const ir::operation::Div &node) const auto activation = 
node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -587,16 +590,16 @@ void KernelGenerator::visit(const ir::operation::Einsum &node) { const auto ofm_index{node.getOutputs().at(0)}; - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); - std::vector input_allocs; + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); + std::vector input_tensors; for (auto &ifm_idx : node.getInputs()) - input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); + input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); const auto equation = node.param().equation; auto fn = std::make_unique(); - fn->configure(input_allocs, equation, output_alloc); + fn->configure(input_tensors, equation, output_tensor); _return_fn = std::move(fn); } @@ -605,14 +608,14 @@ void KernelGenerator::visit(const ir::operation::Custom &node) { auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq, std::vector &types, - std::vector> &allocs) { + std::vector> &tensors) { for (auto &idx : opSeq) { const auto &operand = _ctx.at(idx); // TODO make sure using `_current_op_seq_layout` is correct for custom operations types.emplace_back(custom::TypeInfo{operand.shape(), operand.typeInfo().type()}); - auto in_alloc = _tensor_builder->portableAt(idx); - allocs.emplace_back(in_alloc); + auto in_tensor = _tensor_builder->portableAt(idx); + tensors.emplace_back(in_tensor); } }; @@ -634,12 +637,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -650,13 +653,13 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto axis_alloc = _tensor_builder->portableAt(axis_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto axis_tensor = _tensor_builder->portableAt(axis_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, axis_alloc, output_alloc); + fn->configure(input_tensor, axis_tensor, output_tensor); _return_fn = std::move(fn); } @@ -666,12 +669,12 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) const auto 
output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -681,12 +684,12 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -700,7 +703,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) assert(-rank <= axis && axis < rank); - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); std::vector input_tensors; for (auto &ifm_idx : node.getInputs()) @@ -708,7 +711,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) auto fn = std::make_unique(); - fn->configure(input_tensors, axis, output_alloc); + fn->configure(input_tensors, axis, output_tensor); _return_fn = std::move(fn); } @@ -722,7 +725,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) assert(rank == 0 || (-rank <= axis && axis < rank)); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); std::vector output_tensors; for (auto &output_idx : node.getOutputs()) @@ -732,7 +735,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) uint32_t axis_resolved = (axis < 0 ? axis + rank : axis); - fn->configure(input_alloc, axis_resolved, node.param().num, output_tensors); + fn->configure(input_tensor, axis_resolved, node.param().num, output_tensors); _return_fn = std::move(fn); } @@ -751,8 +754,16 @@ void KernelGenerator::visit(const ir::operation::Pad &node) auto fn = std::make_unique(); - fn->configure(input, output, pad_base, pad_rank); + bool isPadV2 = node.getInputs().size() == 3 ? 
true : false; + const void *value = nullptr; + if (isPadV2) + { + const auto value_index{node.getInputs().at(ir::operation::Pad::Input::VALUE)}; + value = reinterpret_cast(_ctx.at(value_index).data()->base()); + } + + fn->configure(input, output, pad_base, pad_rank, value); _return_fn = std::move(fn); } @@ -762,13 +773,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -779,13 +790,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -795,12 +806,12 @@ void KernelGenerator::visit(const ir::operation::Cast &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -810,12 +821,12 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc, node.param().perm); + fn->configure(input_tensor, output_tensor, node.param().perm); _return_fn = std::move(fn); } @@ -827,15 +838,15 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; const auto keep_dims = node.param().keep_dims; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto axes_alloc 
= _tensor_builder->portableAt(axes_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto axes_tensor = _tensor_builder->portableAt(axes_index).get(); if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN) { auto fn = std::make_unique(); - fn->configure(input_alloc, axes_alloc, output_alloc, keep_dims); + fn->configure(input_tensor, axes_tensor, output_tensor, keep_dims); _return_fn = std::move(fn); } @@ -844,7 +855,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) auto fn = std::make_unique(); const auto reduce_type = convertReduceType(node.param().reduce_type); - fn->configure(input_alloc, axes_alloc, output_alloc, reduce_type, keep_dims); + fn->configure(input_tensor, axes_tensor, output_tensor, reduce_type, keep_dims); _return_fn = std::move(fn); } @@ -855,12 +866,12 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(0)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -872,14 +883,14 @@ void KernelGenerator::visit(const ir::operation::Select &node) const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto condition_alloc = _tensor_builder->portableAt(condition_index).get(); - auto true_alloc = _tensor_builder->portableAt(true_index).get(); - auto false_alloc = _tensor_builder->portableAt(false_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto condition_tensor = _tensor_builder->portableAt(condition_index).get(); + auto true_tensor = _tensor_builder->portableAt(true_index).get(); + auto false_tensor = _tensor_builder->portableAt(false_index).get(); auto fn = std::make_unique(); - fn->configure(condition_alloc, true_alloc, false_alloc, output_alloc); + fn->configure(condition_tensor, true_tensor, false_tensor, output_tensor); _return_fn = std::move(fn); } @@ -891,14 +902,14 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto begins_alloc = _tensor_builder->portableAt(begins_index).get(); - auto sizes_alloc = _tensor_builder->portableAt(sizes_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto begins_tensor = _tensor_builder->portableAt(begins_index).get(); + auto sizes_tensor = _tensor_builder->portableAt(sizes_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, begins_alloc, sizes_alloc, output_alloc); + fn->configure(input_tensor, begins_tensor, sizes_tensor, output_tensor); 
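Every visit in this kernel generator follows the same shape, shown condensed below. SomeOp and ops::SomeLayer are placeholders, not classes introduced by this patch: tensors are looked up as IPortableTensor handles, a kernel object is created and configured, and the finished function is handed back through _return_fn.

    void KernelGenerator::visit(const ir::operation::SomeOp &node)
    {
      const auto out_index{node.getOutputs().at(0)};
      const auto in_index{node.getInputs().at(0)};

      // Look up IPortableTensor handles from the tensor builder
      auto out_tensor = _tensor_builder->portableAt(out_index).get();
      auto in_tensor = _tensor_builder->portableAt(in_index).get();

      // Create the kernel, bind tensors and parameters, and hand it to the executor
      auto fn = std::make_unique<ops::SomeLayer>();
      fn->configure(in_tensor, out_tensor);
      _return_fn = std::move(fn);
    }
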
_return_fn = std::move(fn); } @@ -911,11 +922,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto starts_alloc = _tensor_builder->portableAt(starts_index).get(); - auto ends_alloc = _tensor_builder->portableAt(ends_index).get(); - auto strides_alloc = _tensor_builder->portableAt(strides_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto starts_tensor = _tensor_builder->portableAt(starts_index).get(); + auto ends_tensor = _tensor_builder->portableAt(ends_index).get(); + auto strides_tensor = _tensor_builder->portableAt(strides_index).get(); auto begin_mask = node.param().begin_mask; auto end_mask = node.param().end_mask; @@ -923,7 +934,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto fn = std::make_unique(); - fn->configure(input_alloc, starts_alloc, ends_alloc, strides_alloc, output_alloc, begin_mask, + fn->configure(input_tensor, starts_tensor, ends_tensor, strides_tensor, output_tensor, begin_mask, end_mask, shrink_axis_mask); _return_fn = std::move(fn); @@ -957,12 +968,12 @@ void KernelGenerator::visit(const ir::operation::Abs &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -972,12 +983,12 @@ void KernelGenerator::visit(const ir::operation::Sin &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Sin::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -987,12 +998,12 @@ void KernelGenerator::visit(const ir::operation::Cos &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cos::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1002,12 +1013,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; - auto ofm_alloc = 
_tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1017,12 +1028,12 @@ void KernelGenerator::visit(const ir::operation::Shape &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1033,13 +1044,13 @@ void KernelGenerator::visit(const ir::operation::Reverse &node) const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)}; const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto axis_alloc = _tensor_builder->portableAt(axis_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto axis_tensor = _tensor_builder->portableAt(axis_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, axis_alloc, output_alloc); + fn->configure(input_tensor, axis_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1049,12 +1060,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1066,12 +1077,12 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) const auto axis = node.param().axis; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc, axis, /* is_arg_max */ true); + fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true); _return_fn = std::move(fn); } @@ -1082,13 +1093,13 @@ void KernelGenerator::visit(const ir::operation::Pow &node) const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto 
lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, ir::Activation::NONE, output_alloc); + fn->configure(lhs_tensor, rhs_tensor, ir::Activation::NONE, output_tensor); _return_fn = std::move(fn); } @@ -1098,12 +1109,12 @@ void KernelGenerator::visit(const ir::operation::Log &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Log::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1113,12 +1124,12 @@ void KernelGenerator::visit(const ir::operation::Round &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Round::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1128,12 +1139,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::LogicalNot::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1144,28 +1155,43 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) const auto lhs_index{node.getInputs().at(0)}; const auto rhs_index{node.getInputs().at(1)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } -void KernelGenerator::visit(const ir::operation::ZerosLike &node) +void KernelGenerator::visit(const ir::operation::L2Normalization &node) { const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)}; + const auto input_index{node.getInputs().at(0)}; auto output_alloc = _tensor_builder->portableAt(output_index).get(); auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto fn = std::make_unique(); + auto fn = std::make_unique(); fn->configure(input_alloc, output_alloc); + + 
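As a reference for the L2Normalization kernel wired up above: the operation computes y_i = x_i / sqrt(sum_j x_j^2) along the innermost axis. The standalone sketch below is independent of the actual ops::L2NormLayer implementation and only illustrates the math.

    #include <cmath>

    void l2_normalize(const float *in, float *out, int outer_size, int depth)
    {
      for (int o = 0; o < outer_size; ++o)
      {
        float sum_sq = 0.0f;
        for (int d = 0; d < depth; ++d)
          sum_sq += in[o * depth + d] * in[o * depth + d];
        const float inv_norm = 1.0f / std::sqrt(sum_sq); // epsilon handling omitted for brevity
        for (int d = 0; d < depth; ++d)
          out[o * depth + d] = in[o * depth + d] * inv_norm;
      }
    }
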
_return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::ZerosLike &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)}; + + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique(); + + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1176,14 +1202,14 @@ void KernelGenerator::visit(const ir::operation::Range &node) const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)}; const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto start_alloc = _tensor_builder->portableAt(start_index).get(); - auto limit_alloc = _tensor_builder->portableAt(limit_index).get(); - auto delta_alloc = _tensor_builder->portableAt(delta_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto start_tensor = _tensor_builder->portableAt(start_index).get(); + auto limit_tensor = _tensor_builder->portableAt(limit_index).get(); + auto delta_tensor = _tensor_builder->portableAt(delta_index).get(); auto fn = std::make_unique(); - fn->configure(start_alloc, limit_alloc, delta_alloc, output_alloc); + fn->configure(start_tensor, limit_tensor, delta_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1193,13 +1219,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1209,13 +1235,13 @@ void KernelGenerator::visit(const ir::operation::Tile &node) const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)}; const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto multiples_alloc = _tensor_builder->portableAt(multiples_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto multiples_tensor = _tensor_builder->portableAt(multiples_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, multiples_alloc, output_alloc); + fn->configure(input_tensor, multiples_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1226,14 +1252,14 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node) const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)}; const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto 
input_alloc = _tensor_builder->portableAt(input_index).get(); - auto num_lower_alloc = _tensor_builder->portableAt(num_lower_index).get(); - auto num_upper_alloc = _tensor_builder->portableAt(num_upper_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto num_lower_tensor = _tensor_builder->portableAt(num_lower_index).get(); + auto num_upper_tensor = _tensor_builder->portableAt(num_upper_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, num_lower_alloc, num_upper_alloc, output_alloc); + fn->configure(input_tensor, num_lower_tensor, num_upper_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1243,16 +1269,16 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node) const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); const auto adj_x = node.param().adj_x; const auto adj_y = node.param().adj_y; auto fn = std::make_unique(); - fn->configure(lhs_alloc, rhs_alloc, adj_x, adj_y, output_alloc); + fn->configure(lhs_tensor, rhs_tensor, adj_x, adj_y, output_tensor); _return_fn = std::move(fn); } @@ -1262,13 +1288,13 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node) const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)}; const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto shape_alloc = _tensor_builder->portableAt(shape_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto shape_tensor = _tensor_builder->portableAt(shape_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, shape_alloc, output_alloc); + fn->configure(input_tensor, shape_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1277,10 +1303,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) { const auto ofm_index{node.getOutputs().at(0)}; - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); - std::vector input_allocs; + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); + std::vector input_tensors; for (auto &ifm_idx : node.getInputs()) - input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); + input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); const auto epsilon = node.param().epsilon; const auto is_training = node.param().is_training; @@ -1288,7 +1314,7 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) auto fn = std::make_unique(); - fn->configure(input_allocs, epsilon, is_training, data_format, output_alloc); + fn->configure(input_tensors, epsilon, is_training, data_format, output_tensor); _return_fn = std::move(fn); } @@ -1301,12 +1327,12 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node) const auto beta = 
node.param().beta; const auto axis = node.param().axis; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, beta, axis, output_alloc); + fn->configure(input_tensor, beta, axis, output_tensor); _return_fn = std::move(fn); } @@ -1318,14 +1344,45 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)}; const auto padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto block_shape_alloc = _tensor_builder->portableAt(block_shape_index).get(); - auto padding_alloc = _tensor_builder->portableAt(padding_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto block_shape_tensor = _tensor_builder->portableAt(block_shape_index).get(); + auto padding_tensor = _tensor_builder->portableAt(padding_index).get(); auto fn = std::make_unique(); - fn->configure(input_alloc, block_shape_alloc, padding_alloc, output_alloc); + fn->configure(input_tensor, block_shape_tensor, padding_tensor, output_tensor); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::Quantize &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Quantize::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + + auto fn = std::make_unique(); + + fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) +{ + const auto input_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + auto block_size = node.param().block_size; + + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + + auto fn = std::make_unique(); + + fn->configure(input_tensor, block_size, output_tensor); _return_fn = std::move(fn); } diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h index d6f4c28..f564bf8 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.h +++ b/runtime/onert/backend/cpu/KernelGenerator.h @@ -94,6 +94,7 @@ public: void visit(const ir::operation::SquaredDifference &) override; void visit(const ir::operation::Tile &) override; void visit(const ir::operation::LogicalOr &) override; + void visit(const ir::operation::L2Normalization &) override; void visit(const ir::operation::Range &) override; void visit(const ir::operation::MatrixBandPart &) override; void visit(const ir::operation::BatchMatMul &) override; @@ -101,6 +102,8 @@ public: void visit(const ir::operation::FusedBatchNorm &) override; void visit(const ir::operation::LogSoftmax &) override; void visit(const ir::operation::SpaceToBatchND &) override; + void visit(const ir::operation::Quantize &) override; + void visit(const ir::operation::SpaceToDepth 
&) override; private: const ir::Operands &_ctx; diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc new file mode 100644 index 0000000..8723072 --- /dev/null +++ b/runtime/onert/backend/cpu/StaticTensorManager.cc @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "StaticTensorManager.h" +#include "Tensor.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ + +StaticTensorManager::StaticTensorManager(const std::shared_ptr ®) + : _nonconst_mgr{new cpu_common::MemoryManager()}, _tensors{reg} +{ + // DO NOTHING +} + +void StaticTensorManager::allocateNonconsts(void) +{ + _nonconst_mgr->allocate(); + + for (auto &pair : _tensors->native_tensors()) + { + const auto &ind = pair.first; + auto tensor = pair.second; + if (!_as_constants[ind] && !tensor->is_dynamic()) + { + auto *buffer = _nonconst_mgr->getBuffer(ind); + tensor->setBuffer(buffer); + + VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() + << "): " << static_cast(buffer) << std::endl; + } + } +} + +void StaticTensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); } + +void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, + const ir::OperandInfo &tensor_info, ir::Layout backend_layout, + bool as_const) +{ + assert(!_tensors->getITensor(ind)); + if (as_const) + { + auto tensor = std::make_shared(tensor_info, backend_layout); + _tensors->setNativeTensor(ind, tensor); + } + else + { + auto tensor = std::make_shared(tensor_info, backend_layout); + _tensors->setNativeTensor(ind, tensor); + } + _as_constants[ind] = as_const; +} + +void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) +{ + assert(_tensors->getITensor(ind)); + + // This method is called only when a tensor has proper shape + assert(!_tensors->getITensor(ind)->is_dynamic()); + + if (!_as_constants[ind]) + _nonconst_mgr->claimPlan(ind, size); +} + +void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) +{ + assert(_tensors->getITensor(ind)); + + // This method is called only when a tensor has proper shape + assert(!_tensors->getITensor(ind)->is_dynamic()); + + if (!_as_constants[ind]) + _nonconst_mgr->releasePlan(ind); +} + +void StaticTensorManager::iterate(const std::function &fn) +{ + for (const auto &it : _tensors->native_tensors()) + fn(it.first); +} + +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/StaticTensorManager.h b/runtime/onert/backend/cpu/StaticTensorManager.h new file mode 100644 index 0000000..66243a5 --- /dev/null +++ b/runtime/onert/backend/cpu/StaticTensorManager.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
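The expected call sequence for the manager introduced here, as a rough usage sketch; reg, ind, info and size_in_bytes are placeholders, and the real calls are made by TensorBuilder further down in this patch. The key point is that only non-constant static tensors participate in memory planning, while constants alias external (model) data.

    StaticTensorManager mgr{reg};                                      // reg: shared cpu_common::TensorRegistry
    mgr.buildTensor(ind, info, ir::Layout::NHWC, /*as_const=*/false);  // constants become ExternalTensor instead
    mgr.claimPlan(ind, size_in_bytes);   // planned only for non-constant, non-dynamic tensors
    mgr.allocateNonconsts();             // constants need no backend buffer, so nothing to allocate for them
    // ... run ...
    mgr.releasePlan(ind);
    mgr.deallocateNonconsts();
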
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ +#define __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ + +#include "backend/IStaticTensorManager.h" +#include "backend/cpu_common/MemoryManager.h" +#include "backend/cpu_common/TensorRegistry.h" +#include "backend/ITensorManager.h" +#include "ir/OperandIndexMap.h" +#include "ir/OperandInfo.h" + +namespace onert +{ +namespace backend +{ +namespace cpu +{ + +class StaticTensorManager : public backend::IStaticTensorManager +{ +public: + StaticTensorManager(const std::shared_ptr ®); + virtual ~StaticTensorManager() = default; + + void allocateNonconsts(void); + void deallocateNonconsts(void); + + void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, + ir::Layout backend_layout, bool as_const); + + void claimPlan(const ir::OperandIndex &ind, uint32_t size); + void releasePlan(const ir::OperandIndex &ind); + + void iterate(const std::function &fn); + +private: + std::unique_ptr _nonconst_mgr; + const std::shared_ptr _tensors; + ir::OperandIndexMap _as_constants; +}; + +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h index 4dd251b..da16d05 100644 --- a/runtime/onert/backend/cpu/Tensor.h +++ b/runtime/onert/backend/cpu/Tensor.h @@ -29,8 +29,14 @@ namespace cpu using Tensor = cpu_common::Tensor; -// Tensor which has data from external. To support this, assume below things -// no padding, always NHWC layout, constant tensor and not dynamic +/** + * @brief Class that uses data from external memory that is not managed by a backend + * instead of allocating and copying the data. ExternalTensor's data pointer points to + * an address of memory such as where memory is already allocated, or mmapped area. + * This is meaning that ExternalTensor can take all of types' ir::Data. + * To support this, assume below things no padding, always NHWC layout, + * constant tensor and not dynamic. 
+ */ class ExternalTensor : public Tensor { public: @@ -45,6 +51,11 @@ public: } public: + /** + * @brief set Data to be shared from external so that this ExternalTensor will not be + * allocated on CPU backend + * @param[in] data data of Operand to be set + */ void setData(const std::shared_ptr data) { assert(data != nullptr); diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc index 886e8d8..7eb3ce8 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.cc +++ b/runtime/onert/backend/cpu/TensorBuilder.cc @@ -29,7 +29,7 @@ namespace cpu TensorBuilder::TensorBuilder() : _tensor_reg{new cpu_common::TensorRegistry()}, - _static_tensor_mgr{new cpu_common::StaticTensorManager(_tensor_reg)}, + _static_tensor_mgr{new StaticTensorManager(_tensor_reg)}, _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)} { /* empty */ @@ -77,11 +77,7 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const return _tensor_info_map.find(ind) != _tensor_info_map.end(); } -void TensorBuilder::prepare(void) -{ - _static_tensor_mgr->allocateConsts(); - _static_tensor_mgr->allocateNonconsts(); -} +void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); } void TensorBuilder::allocate() { @@ -99,17 +95,17 @@ std::shared_ptr TensorBuilder::portableAt(const ir::OperandInde return _tensor_reg->getPortableTensor(ind); } -bool TensorBuilder::setExternalTensor(const ir::OperandIndex &ind, - const std::shared_ptr &tensor) +bool TensorBuilder::setMigrantTensor(const ir::OperandIndex &ind, + const std::shared_ptr &tensor) { - return _tensor_reg->setExternalTensor(ind, tensor); + return _tensor_reg->setMigrantTensor(ind, tensor); } void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->iterate(fn); } -std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) +std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) { - return _tensor_reg->getManagedTensor(ind); + return _tensor_reg->getNativeTensor(ind); } std::unique_ptr TensorBuilder::releaseStaticTensorManager(void) diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h index ba25451..12ca28c 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.h +++ b/runtime/onert/backend/cpu/TensorBuilder.h @@ -18,13 +18,14 @@ #define __ONERT_BACKEND_CPU_TENSOR_BUILDER_H__ #include -#include #include -#include #include #include +#include "StaticTensorManager.h" +#include "Tensor.h" + #include namespace onert @@ -80,16 +81,16 @@ public: * If not, program will crash with assert or exception. 
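The simplified prepare() above no longer calls allocateConsts() because constant operands are now wrapped in ExternalTensor, whose buffer points directly at the operand's ir::Data (for example an mmapped region of the model file), so nothing is copied into backend-managed memory. A condensed, hypothetical stand-in for that idea, assuming ir::Data exposes base() and size(); this is not the actual class:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <memory>

// Illustrative only: share externally owned constant data instead of allocating.
class SharedConstData
{
public:
  void setData(const std::shared_ptr<onert::ir::Data> &data)
  {
    assert(data != nullptr);
    _data = data; // keep the Data alive; no memcpy into backend memory
  }
  const uint8_t *buffer() const { return _data ? _data->base() : nullptr; }
  size_t total_size() const { return _data ? _data->size() : 0; }

private:
  std::shared_ptr<onert::ir::Data> _data;
};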
* @return shared_ptr */ - std::shared_ptr at(const ir::OperandIndex &ind); + std::shared_ptr at(const ir::OperandIndex &ind); std::shared_ptr portableAt(const ir::OperandIndex &ind); - bool setExternalTensor(const ir::OperandIndex &ind, - const std::shared_ptr &tensor) override; + bool setMigrantTensor(const ir::OperandIndex &ind, + const std::shared_ptr &tensor) override; std::shared_ptr tensorRegistry() override { return _tensor_reg; } private: const std::shared_ptr _tensor_reg; - std::unique_ptr _static_tensor_mgr; + std::unique_ptr _static_tensor_mgr; std::unique_ptr _dynamic_tensor_mgr; ir::OperandIndexMap _tensor_info_map; }; diff --git a/runtime/onert/backend/cpu/ops/CompareLayer.cc b/runtime/onert/backend/cpu/ops/CompareLayer.cc index f557f3a..adf902a 100644 --- a/runtime/onert/backend/cpu/ops/CompareLayer.cc +++ b/runtime/onert/backend/cpu/ops/CompareLayer.cc @@ -17,6 +17,7 @@ #include "OperationUtils.h" +#include #include using namespace nnfw::cker; namespace onert @@ -34,6 +35,14 @@ namespace using OpType = onert::ir::operation::Comparison::ComparisonType; using namespace onert::backend::cpu; +// Assumes these enum values to be in the order like this +static_assert(static_cast(OpType::Equal) == 0, "An OpType value has changed!"); +static_assert(static_cast(OpType::NotEqual) == 1, "An OpType value has changed!"); +static_assert(static_cast(OpType::Greater) == 2, "An OpType value has changed!"); +static_assert(static_cast(OpType::GreaterEqual) == 3, "An OpType value has changed!"); +static_assert(static_cast(OpType::Less) == 4, "An OpType value has changed!"); +static_assert(static_cast(OpType::LessEqual) == 5, "An OpType value has changed!"); + template void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output, OpType op_type) @@ -52,95 +61,33 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort ¶ms.input2_shift); params.is_broadcast = !HaveSameShapes(lhs, rhs); - if (params.is_broadcast) - { - switch (op_type) - { - case OpType::Equal: - Broadcast4DSlowEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::NotEqual: - Broadcast4DSlowNotEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Greater: - Broadcast4DSlowGreaterWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::GreaterEqual: - Broadcast4DSlowGreaterEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Less: - Broadcast4DSlowLessWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::LessEqual: - Broadcast4DSlowLessEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), 
- getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - else // if (requires_broadcast == false) - { - switch (op_type) - { - case OpType::Equal: - EqualWithScaling(params, getExtendedTensorShape(lhs), - reinterpret_cast(lhs->buffer()), getExtendedTensorShape(rhs), - reinterpret_cast(rhs->buffer()), getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - case OpType::NotEqual: - NotEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Greater: - GreaterWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::GreaterEqual: - GreaterEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Less: - LessWithScaling(params, getExtendedTensorShape(lhs), - reinterpret_cast(lhs->buffer()), getExtendedTensorShape(rhs), - reinterpret_cast(rhs->buffer()), getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - case OpType::LessEqual: - LessEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - return; + using CompareFunction = + void (*)(ComparisonParams & params, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, + bool *output_data); + + static const CompareFunction broadcast_fns[] = { + Broadcast4DSlowEqualWithScaling, Broadcast4DSlowNotEqualWithScaling, + Broadcast4DSlowGreaterWithScaling, Broadcast4DSlowGreaterEqualWithScaling, + Broadcast4DSlowLessWithScaling, Broadcast4DSlowLessEqualWithScaling, + }; + static const CompareFunction non_broadcast_fns[] = { + EqualWithScaling, NotEqualWithScaling, GreaterWithScaling, + GreaterEqualWithScaling, LessWithScaling, LessEqualWithScaling, + }; + + static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns), + "Sizes of broadcast_fns and non_broadcast_fns must match!"); + + auto index = static_cast(op_type); + if (index < 0 || index >= static_cast(sizeof(broadcast_fns) / sizeof(broadcast_fns[0]))) + throw std::runtime_error{"Invalid OpType for CompareLayer"}; + + CompareFunction fn = (params.is_broadcast ? 
broadcast_fns[index] : non_broadcast_fns[index]); + + fn(params, getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), + getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), + getExtendedTensorShape(output), reinterpret_cast(output->buffer())); } template @@ -149,94 +96,33 @@ void compareScalar(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort { bool requires_broadcast = !HaveSameShapes(lhs, rhs); - if (requires_broadcast) - { - switch (op_type) - { - case OpType::Equal: - Broadcast4DSlowEqual( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::NotEqual: - Broadcast4DSlowNotEqual( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Greater: - Broadcast4DSlowGreater( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::GreaterEqual: - Broadcast4DSlowGreaterEqual( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Less: - Broadcast4DSlowLess(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - case OpType::LessEqual: - Broadcast4DSlowLessEqual( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - else // if (requires_broadcast == false) - { - switch (op_type) - { - case OpType::Equal: - EqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::NotEqual: - NotEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - case OpType::Greater: - GreaterNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - case OpType::GreaterEqual: - GreaterEqualNoScaling( - getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::Less: - LessNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast(output->buffer())); - break; - case OpType::LessEqual: - LessEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), - 
getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - return; + using CompareFunction = + void (*)(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, + const T *input2_data, const Shape &output_shape, bool *output_data); + + static const CompareFunction broadcast_fns[] = { + Broadcast4DSlowEqual, Broadcast4DSlowNotEqual, Broadcast4DSlowGreater, + Broadcast4DSlowGreaterEqual, Broadcast4DSlowLess, Broadcast4DSlowLessEqual, + }; + static const CompareFunction non_broadcast_fns[] = { + EqualNoScaling, NotEqualNoScaling, GreaterNoScaling, + GreaterEqualNoScaling, LessNoScaling, LessEqualNoScaling, + }; + + static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns), + "Sizes of broadcast_fns and non_broadcast_fns must match!"); + + auto index = static_cast(op_type); + if (index < 0 || index >= static_cast(sizeof(broadcast_fns) / sizeof(broadcast_fns[0]))) + throw std::runtime_error{"Invalid OpType for CompareLayer"}; + + CompareFunction fn = (requires_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]); + + fn(getExtendedTensorShape(lhs), reinterpret_cast(lhs->buffer()), + getExtendedTensorShape(rhs), reinterpret_cast(rhs->buffer()), + getExtendedTensorShape(output), reinterpret_cast(output->buffer())); } + } // namespace CompareLayer::CompareLayer() diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc index c00be64..ff22e32 100644 --- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc @@ -18,6 +18,7 @@ #include "../Tensor.h" #include +#include namespace onert { @@ -112,15 +113,32 @@ void FullyConnectedLayer::fullyConnectedHybrid() getTensorShape(_bias), reinterpret_cast(_bias ? _bias->buffer() : nullptr), getTensorShape(_output), reinterpret_cast(_output->buffer()), temp_arena); -// TODO Enable calling decrease_ref -#if 0 +// TODO Remove this ifdef +#ifdef EXPERIMENTAL_RUY_FEATURE if (_cached_weights == nullptr || _is_weights_freed) return; + // '_cached_weights is not nullptr and _is_weights_freed is false' means + // this weight shape is satisfied with the ruy kernel's prepack cache's condition. + // After entering here, it will not enter again except below the case - input is zero-vector + + // if input's elements are filled with zero, it by-passes(does not enter ruy-kernel path) + // so that handle this case + const int input_size = getTensorShape(_input).FlatSize(); + if (nnfw::cker::IsZeroVector(reinterpret_cast(_input->buffer()), input_size)) + return; + + // This weight tensor could be other ops' const tensor. + // Therefore, below reference should be checked like following auto weight_tensor = dynamic_cast(_weights); if (weight_tensor) { auto tensor = const_cast(weight_tensor); + if (tensor->buffer() == nullptr) // ref is already 0? + { + _is_weights_freed = true; + return; + } tensor->decrease_ref(); if (tensor->buffer() == nullptr) // ref == 0? 
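Stepping back to the CompareLayer change earlier in this hunk: the two switch statements were replaced by arrays of function pointers indexed by the ComparisonType value, which is why the static_asserts pinning the enum values were added at the top of the file. A toy example of the same pattern, unrelated to the real cker kernels, showing the index check and table lookup:

#include <cstddef>
#include <stdexcept>

enum class CmpOp { Equal = 0, NotEqual = 1, Greater = 2 };

using CmpFn = bool (*)(int, int);
static bool opEqual(int a, int b) { return a == b; }
static bool opNotEqual(int a, int b) { return a != b; }
static bool opGreater(int a, int b) { return a > b; }

bool dispatch(CmpOp op, int a, int b)
{
  static const CmpFn fns[] = {opEqual, opNotEqual, opGreater}; // must match the enum order
  const auto index = static_cast<std::size_t>(op);
  if (index >= sizeof(fns) / sizeof(fns[0]))
    throw std::runtime_error{"Invalid CmpOp"};
  return fns[index](a, b);
}

The design trades the explicit per-case code of the switch for a single shared call site; the cost is that enum order and table order must stay in sync, which the static_asserts enforce at compile time.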
@@ -128,7 +146,7 @@ void FullyConnectedLayer::fullyConnectedHybrid() _is_weights_freed = true; } } -#endif // if 0 +#endif #endif } @@ -167,7 +185,17 @@ void FullyConnectedLayer::run() void FullyConnectedLayer::prepare() { + if (_bias && _bias->is_constant()) + { + const int bias_size = getTensorShape(_bias).FlatSize(); + if (nnfw::cker::IsZeroVector(reinterpret_cast(_bias->buffer()), bias_size)) + { + _bias = nullptr; + } + } + #ifdef USE_RUY_GEMV +#ifdef EXPERIMENTAL_RUY_FEATURE // TODO This is workaround // The only fc hybrid will use ruy kernel if (_input->data_type() != OperandType::FLOAT32 || @@ -199,6 +227,7 @@ void FullyConnectedLayer::prepare() } } #endif +#endif } } // namespace ops diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h index dd5ef24..e405b24 100644 --- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h +++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h @@ -72,6 +72,9 @@ private: #ifdef USE_RUY_GEMV uint8_t *_cached_weights = nullptr; // weights to be cached and a key +#ifdef EXPERIMENTAL_RUY_FEATURE + bool _is_weights_freed = false; // is weights freed? +#endif #endif }; diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.cc b/runtime/onert/backend/cpu/ops/L2NormLayer.cc new file mode 100644 index 0000000..0d99b05 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/L2NormLayer.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "L2NormLayer.h" + +#include "OperationUtils.h" + +#include +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +void L2NormLayer::configure(const IPortableTensor *input, IPortableTensor *output) +{ + assert(input != nullptr); + assert(output != nullptr); + + _input = input; + _output = output; +} + +void L2NormLayer::run() +{ + switch (_input->data_type()) + { + case OperandType::FLOAT32: + nnfw::cker::L2NormalizeFloat32( + getTensorShape(_input), reinterpret_cast(_input->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer())); + break; + + case OperandType::QUANT_UINT8_ASYMM: + { + nnfw::cker::L2NormParams params; + assert(_input->data_offset() == 128); + params.input_zero_point = _input->data_offset(); + nnfw::cker::L2NormalizeQuant8( + params, getTensorShape(_input), reinterpret_cast(_input->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer())); + } + break; + + default: + throw std::runtime_error{"L2Norm: Unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.h b/runtime/onert/backend/cpu/ops/L2NormLayer.h new file mode 100644 index 0000000..63f2d11 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/L2NormLayer.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in riting, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ + +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +class L2NormLayer : public ::onert::exec::IFunction +{ +public: + L2NormLayer() : _input(nullptr), _output(nullptr) + { + // Nothing + } + +public: + void configure(const IPortableTensor *_input, IPortableTensor *output); + + void run() override; + +private: + const IPortableTensor *_input; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc index d71e325..06dde4f 100644 --- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc +++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc @@ -49,8 +49,8 @@ void LogSoftMaxLayer::logsoftmaxQuant8() // NYI } -void LogSoftMaxLayer::configure(const Tensor *input, const float beta, const int axis, - Tensor *output) +void LogSoftMaxLayer::configure(const IPortableTensor *input, const float beta, const int axis, + IPortableTensor *output) { _input = input; _output = output; diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h index bc145ce..ba9deca 100644 --- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h +++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h @@ -40,13 +40,14 @@ public: void logsoftmaxQuant8(); - void configure(const Tensor *input, const float beta, const int axis, Tensor *output); + void configure(const IPortableTensor *input, const float beta, const int axis, + IPortableTensor *output); void run(); private: - const Tensor *_input; - Tensor *_output; + const IPortableTensor *_input; + IPortableTensor *_output; float _beta; int _axis; diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h index 8d29374..9838552 100644 --- a/runtime/onert/backend/cpu/ops/OperationUtils.h +++ b/runtime/onert/backend/cpu/ops/OperationUtils.h @@ -52,6 +52,17 @@ union DataPtr { void *v; }; +union ConstDataPtr { + const uint8_t *u8; + const int8_t *i8; + const uint32_t *u32; + const int32_t *i32; + const bool *b; + const float *f; + const int64_t *i64; + const void *v; +}; + uint32_t getNumberOfDimensions(const IPortableTensor *tensor); uint32_t getNumberOfElements(const IPortableTensor *tensor); diff --git a/runtime/onert/backend/cpu/ops/PadLayer.cc b/runtime/onert/backend/cpu/ops/PadLayer.cc index fcfcf7b..6a2bf9d 100644 --- a/runtime/onert/backend/cpu/ops/PadLayer.cc +++ b/runtime/onert/backend/cpu/ops/PadLayer.cc @@ -33,33 +33,40 @@ PadLayer::PadLayer() // DO NOTHING } -void PadLayer::padFloat32() +template void PadLayer::padImpl(const T *constant_value_data) { - nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input), - 
reinterpret_cast(_input->buffer()), getTensorShape(_output), - reinterpret_cast(_output->buffer()), _constantValueData.f); + nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input), + reinterpret_cast(_input->buffer()), getTensorShape(_output), + reinterpret_cast(_output->buffer()), constant_value_data); } -void PadLayer::padQuant8() { throw std::runtime_error("Quantized Pad isn't supported NYI"); } void PadLayer::configure(const IPortableTensor *input, IPortableTensor *output, - const int32_t *padData, int32_t padRank, uint8_t *constantValueData) + const int32_t *padData, int32_t padRank, const void *constantValueData) { _input = input; _output = output; memcpy(_padData, padData, sizeof(_padData)); _padRank = padRank; - _constantValueData.u8 = constantValueData; + _constantValueData.v = constantValueData; } void PadLayer::run() { if (_input->data_type() == OperandType::FLOAT32) { - padFloat32(); + padImpl(_constantValueData.f); } else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) { - padQuant8(); + if (_constantValueData.u8 == nullptr) + { + uint8_t pad_value = static_cast(_output->data_offset()); + padImpl(&pad_value); + } + else + { + padImpl(_constantValueData.u8); + } } else { diff --git a/runtime/onert/backend/cpu/ops/PadLayer.h b/runtime/onert/backend/cpu/ops/PadLayer.h index 85bd2e6..efd73d5 100644 --- a/runtime/onert/backend/cpu/ops/PadLayer.h +++ b/runtime/onert/backend/cpu/ops/PadLayer.h @@ -39,12 +39,10 @@ public: PadLayer(); public: - void padFloat32(); - - void padQuant8(); + template void padImpl(const T *constant_value_data); void configure(const IPortableTensor *input, IPortableTensor *output, const int32_t *padData, - int32_t padRank, uint8_t *constantValueData = nullptr); + int32_t padRank, const void *constantValueData = nullptr); void run() override; @@ -54,7 +52,7 @@ private: int32_t _padData[8]; int32_t _padRank; - DataPtr _constantValueData; + ConstDataPtr _constantValueData; }; } // namespace ops diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.cc b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc new file mode 100644 index 0000000..45fc148 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
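In the quantized Pad path added above, when no explicit pad value is supplied the layer pads with the output tensor's zero point (data_offset), because in an asymmetric uint8 encoding the zero point is the representation of real 0.0. A compressed sketch of that selection, as a hypothetical helper mirroring the branch in PadLayer::run:

#include <cstdint>

// Pick the uint8 pad value: the user-supplied constant if present,
// otherwise the output zero point, i.e. the quantized encoding of 0.0.
uint8_t selectQuantPadValue(const uint8_t *constant_value_data, int32_t output_zero_point)
{
  if (constant_value_data != nullptr)
    return *constant_value_data;
  return static_cast<uint8_t>(output_zero_point);
}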
+ */ + +#include "QuantizeLayer.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +QuantizeLayer::QuantizeLayer() : _input(nullptr), _output(nullptr) +{ + // DO NOTHING +} + +template void QuantizeLayer::affineQuantize() +{ + nnfw::cker::Quantize(getTensorShape(_input), reinterpret_cast(_input->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer()), + _output->data_scale(), _output->data_offset()); +} + +void QuantizeLayer::configure(const IPortableTensor *input, IPortableTensor *output) +{ + _input = input; + _output = output; +} + +void QuantizeLayer::run() +{ + if (_input->data_type() == OperandType::FLOAT32) + { + affineQuantize(); + } + else + { + throw std::runtime_error{"Quantize: unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.h b/runtime/onert/backend/cpu/ops/QuantizeLayer.h new file mode 100644 index 0000000..b4e7aca --- /dev/null +++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ + +#include +#include "OperationUtils.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +class QuantizeLayer : public ::onert::exec::IFunction +{ +public: + QuantizeLayer(); + +public: + template void affineQuantize(); + + void configure(const IPortableTensor *input, IPortableTensor *output); + + void run() override; + +private: + const IPortableTensor *_input; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.cc b/runtime/onert/backend/cpu/ops/SliceLayer.cc index a9106c1..449c073 100644 --- a/runtime/onert/backend/cpu/ops/SliceLayer.cc +++ b/runtime/onert/backend/cpu/ops/SliceLayer.cc @@ -46,7 +46,7 @@ void SliceLayer::GetBeginAndSizeVectors(int dimensions, const IPortableTensor *b } } -void SliceLayer::sliceFloat32() +template void SliceLayer::sliceImpl() { const int kMaxDim = nnfw::cker::Shape::kMaxSmallSize; @@ -74,14 +74,8 @@ void SliceLayer::sliceFloat32() } nnfw::cker::Slice(op_params, getExtendedTensorShape(_input), - reinterpret_cast(_input->buffer()), - reinterpret_cast(_output->buffer())); -} - -void SliceLayer::sliceQuant8() -{ - // cker quant8 slice is not implemented yet - throw std::runtime_error{"NYI"}; + reinterpret_cast(_input->buffer()), + reinterpret_cast(_output->buffer())); } void SliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin, @@ -97,11 +91,11 @@ void SliceLayer::run() { if (_input->data_type() == OperandType::FLOAT32) { - sliceFloat32(); + sliceImpl(); } else if 
(_input->data_type() == OperandType::QUANT_UINT8_ASYMM) { - sliceQuant8(); + sliceImpl(); } else { diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.h b/runtime/onert/backend/cpu/ops/SliceLayer.h index 9945d7e..650e2c9 100644 --- a/runtime/onert/backend/cpu/ops/SliceLayer.h +++ b/runtime/onert/backend/cpu/ops/SliceLayer.h @@ -42,8 +42,7 @@ public: void run() override; private: - void sliceFloat32(); - void sliceQuant8(); + template void sliceImpl(); template void GetBeginAndSizeVectors(int dimensions, const IPortableTensor *begin, diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc new file mode 100644 index 0000000..110b0bc --- /dev/null +++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "SpaceToDepthLayer.h" + +#include "OperationUtils.h" + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +SpaceToDepthLayer::SpaceToDepthLayer() : _input(nullptr), _block_size(0), _output(nullptr) +{ + // DO NOTHING +} + +template void SpaceToDepthLayer::spaceToDepth() +{ + + nnfw::cker::SpaceToDepthParams params; + params.block_size = _block_size; + + nnfw::cker::SpaceToDepth(params, getTensorShape(_input), + reinterpret_cast(_input->buffer()), + getTensorShape(_output), reinterpret_cast(_output->buffer())); +} + +void SpaceToDepthLayer::configure(const IPortableTensor *input, const int32_t block_size, + IPortableTensor *output) +{ + _input = input; + _block_size = block_size; + _output = output; +} + +void SpaceToDepthLayer::run() +{ + if (_input->data_type() == OperandType::FLOAT32) + { + spaceToDepth(); + } + else + { + throw std::runtime_error{"SpaceToDepth: unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h new file mode 100644 index 0000000..c11ef2b --- /dev/null +++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in riting, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
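SpaceToDepth, as wired up above for FLOAT32, moves each block_size x block_size spatial block into the channel dimension, so an NHWC input of shape [N, H, W, C] produces [N, H/b, W/b, C*b*b]. A small helper showing only the expected output shape, illustrative and independent of the cker kernel:

#include <array>
#include <cassert>

// Output shape of SpaceToDepth for an NHWC input; H and W must be divisible by b.
std::array<int, 4> spaceToDepthShape(const std::array<int, 4> &nhwc, int b)
{
  assert(b > 0 && nhwc[1] % b == 0 && nhwc[2] % b == 0);
  return {nhwc[0], nhwc[1] / b, nhwc[2] / b, nhwc[3] * b * b};
}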
+ */ + +#ifndef __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ + +#include + +#include + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +class SpaceToDepthLayer : public ::onert::exec::IFunction +{ +public: + SpaceToDepthLayer(); + + void configure(const IPortableTensor *input, const int32_t block_size, IPortableTensor *output); + + void run() override; + +private: + template void spaceToDepth(); + + const IPortableTensor *_input; + int32_t _block_size; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_SPACE_TO_BATCH_ND_LAYER_H__ diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h index a49525b..b760cda 100644 --- a/runtime/onert/core/include/backend/ITensorBuilder.h +++ b/runtime/onert/core/include/backend/ITensorBuilder.h @@ -112,12 +112,12 @@ public: // methods for static tensor allocation virtual std::shared_ptr tensorAt(const ir::OperandIndex &ind) = 0; /** - * @brief Set the External Tensor object + * @brief Set the migrant tensor object * * @return true if succeeded * @return false if failed or unsupported */ - virtual bool setExternalTensor(const ir::OperandIndex &, const std::shared_ptr &) + virtual bool setMigrantTensor(const ir::OperandIndex &, const std::shared_ptr &) { return false; } diff --git a/runtime/onert/core/include/backend/ITensorRegistry.h b/runtime/onert/core/include/backend/ITensorRegistry.h index f5a95f4..8555131 100644 --- a/runtime/onert/core/include/backend/ITensorRegistry.h +++ b/runtime/onert/core/include/backend/ITensorRegistry.h @@ -35,17 +35,22 @@ struct ITensorRegistry virtual ~ITensorRegistry() = default; /** - * @brief Returns pointer of ITensor among managed and external tensors + * @brief Returns pointer of ITensor among native and migrant tensors + * + * Native Tensor is a tensor that is managed by this backend + * Migrant Tensor is a tensor that is imported from another backend + * * @note Return tensor cannot be used longer than dynamic tensor manager */ virtual std::shared_ptr getITensor(const ir::OperandIndex &) = 0; /** - * @brief Returns pointer of ITensor among managed tensors + * @brief Returns pointer of ITensor among native tensors * - * Unlike @c getITensor , this function only searches from managed tensors - * @note Return tensor cannot be used longer than dynamic tensor manager + * Unlike @c getITensor , this function only searches from native tensors + * + * @note Returned tensor cannot be used longer than dynamic tensor manager */ - virtual std::shared_ptr getManagedITensor(const ir::OperandIndex &) = 0; + virtual std::shared_ptr getNativeITensor(const ir::OperandIndex &) = 0; }; } // namespace backend @@ -73,68 +78,67 @@ public: std::shared_ptr getITensor(const ir::OperandIndex &ind) override { static_assert(std::is_base_of::value, "T_Tensor must derive from ITensor."); - auto external_tensor = _external.find(ind); - if (external_tensor != _external.end()) + auto external_tensor = _migrant.find(ind); + if (external_tensor != _migrant.end()) return external_tensor->second; - return getManagedTensor(ind); + return getNativeTensor(ind); } - std::shared_ptr getManagedITensor(const ir::OperandIndex &ind) override + std::shared_ptr getNativeITensor(const ir::OperandIndex &ind) override { - return getManagedTensor(ind); + return getNativeTensor(ind); } std::shared_ptr 
getPortableTensor(const ir::OperandIndex &ind) { - auto external_tensor = _external.find(ind); - if (external_tensor != _external.end()) + auto external_tensor = _migrant.find(ind); + if (external_tensor != _migrant.end()) { if (external_tensor->second) return external_tensor->second; } - return getManagedTensor(ind); + return getNativeTensor(ind); } - std::shared_ptr getManagedTensor(const ir::OperandIndex &ind) + std::shared_ptr getNativeTensor(const ir::OperandIndex &ind) { - auto tensor = _managed.find(ind); - if (tensor != _managed.end()) + auto tensor = _native.find(ind); + if (tensor != _native.end()) return tensor->second; return nullptr; } - bool setExternalTensor(const ir::OperandIndex &ind, - const std::shared_ptr &tensor) + bool setMigrantTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) { // TODO Uncomment this as two tensors for an index is not allowed. // But now it is temporarily allowed as a workaround. External one hides Managed one. - // auto itr = _managed.find(ind); - // if (itr != _managed.end() && itr->second != nullptr && tensor != nullptr) + // auto itr = _native.find(ind); + // if (itr != _native.end() && itr->second != nullptr && tensor != nullptr) // throw std::runtime_error{ - // "Tried to set an external tensor but an managed tensor already exists."}; - _external[ind] = tensor; + // "Tried to set an migrant tensor but an native tensor already exists."}; + _migrant[ind] = tensor; return true; } - void setManagedTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) + void setNativeTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) { - auto itr = _external.find(ind); - if (itr != _external.end() && itr->second != nullptr && tensor != nullptr) + auto itr = _migrant.find(ind); + if (itr != _migrant.end() && itr->second != nullptr && tensor != nullptr) throw std::runtime_error{ - "Tried to set a managed tensor but an external tensor already exists."}; - _managed[ind] = tensor; + "Tried to set a native tensor but an migrant tensor already exists."}; + _native[ind] = tensor; } - const ir::OperandIndexMap> &managed_tensors() { return _managed; } + const ir::OperandIndexMap> &native_tensors() { return _native; } - const ir::OperandIndexMap> &external_tensors() + const ir::OperandIndexMap> &migrant_tensors() { - return _external; + return _migrant; } private: - ir::OperandIndexMap> _external; - ir::OperandIndexMap> _managed; + ir::OperandIndexMap> _migrant; + ir::OperandIndexMap> _native; }; } // namespace backend diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h index 6ddacc7..a7e034a 100644 --- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h +++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h @@ -19,7 +19,7 @@ #include "MemoryManager.h" -#include "backend/ITensorManager.h" +#include "backend/IStaticTensorManager.h" #include "ir/OperandIndexMap.h" #include "ir/OperandInfo.h" #include "TensorRegistry.h" @@ -31,7 +31,7 @@ namespace backend namespace cpu_common { -class StaticTensorManager : public backend::ITensorManager +class StaticTensorManager : public backend::IStaticTensorManager { public: StaticTensorManager(const std::shared_ptr ®); diff --git a/runtime/onert/core/include/compiler/StaticShapeInference.h b/runtime/onert/core/include/compiler/StaticShapeInference.h index 379143b..b3391a3 100644 --- a/runtime/onert/core/include/compiler/StaticShapeInference.h +++ 
b/runtime/onert/core/include/compiler/StaticShapeInference.h @@ -99,6 +99,7 @@ private: void visit(const ir::operation::LogicalNot &op) override; void visit(const ir::operation::LogicalOr &op) override; void visit(const ir::operation::Logistic &op) override; + void visit(const ir::operation::L2Normalization &op) override; void visit(const ir::operation::MatrixBandPart &op) override; void visit(const ir::operation::Max &op) override; void visit(const ir::operation::Min &op) override; diff --git a/runtime/onert/core/include/exec/DynamicShapeInference.h b/runtime/onert/core/include/exec/DynamicShapeInference.h index 113c348..601c1bf 100644 --- a/runtime/onert/core/include/exec/DynamicShapeInference.h +++ b/runtime/onert/core/include/exec/DynamicShapeInference.h @@ -72,6 +72,7 @@ public: void visit(const ir::operation::LogicalNot &op) override; void visit(const ir::operation::LogicalOr &op) override; void visit(const ir::operation::Logistic &op) override; + void visit(const ir::operation::L2Normalization &op) override; void visit(const ir::operation::MatrixBandPart &op) override; void visit(const ir::operation::Max &op) override; void visit(const ir::operation::Min &op) override; diff --git a/runtime/onert/core/include/ir/Operations.Include.h b/runtime/onert/core/include/ir/Operations.Include.h index 5fac54e..e3b5d19 100644 --- a/runtime/onert/core/include/ir/Operations.Include.h +++ b/runtime/onert/core/include/ir/Operations.Include.h @@ -103,3 +103,4 @@ #include "ir/operation/BatchMatMul.h" #include "ir/operation/FusedBatchNorm.h" #include "ir/operation/LogSoftmax.h" +#include "ir/operation/Quantize.h" diff --git a/runtime/onert/core/include/ir/Operations.lst b/runtime/onert/core/include/ir/Operations.lst index 9d0642f..03a2aa2 100644 --- a/runtime/onert/core/include/ir/Operations.lst +++ b/runtime/onert/core/include/ir/Operations.lst @@ -106,3 +106,4 @@ OP(MatrixBandPart) OP(BatchMatMul) OP(FusedBatchNorm) OP(LogSoftmax) +OP(Quantize) diff --git a/runtime/onert/core/include/ir/operation/LogSoftmax.h b/runtime/onert/core/include/ir/operation/LogSoftmax.h index 26a92d7..391b4ba 100644 --- a/runtime/onert/core/include/ir/operation/LogSoftmax.h +++ b/runtime/onert/core/include/ir/operation/LogSoftmax.h @@ -48,7 +48,7 @@ public: public: void accept(OperationVisitor &v) const override; - OpCode opcode() const final { return OpCode::Softmax; } + OpCode opcode() const final { return OpCode::LogSoftmax; } public: const Param ¶m() const { return _param; } diff --git a/runtime/onert/core/include/ir/operation/Pad.h b/runtime/onert/core/include/ir/operation/Pad.h index a486061..00481cd 100644 --- a/runtime/onert/core/include/ir/operation/Pad.h +++ b/runtime/onert/core/include/ir/operation/Pad.h @@ -33,7 +33,7 @@ public: { INPUT = 0, PAD = 1, - // VALUE = 2 Not allow padding value operand yet + VALUE = 2 }; public: diff --git a/runtime/onert/core/include/ir/operation/Quantize.h b/runtime/onert/core/include/ir/operation/Quantize.h new file mode 100644 index 0000000..2533ce4 --- /dev/null +++ b/runtime/onert/core/include/ir/operation/Quantize.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
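The Quantize operation registered above in Operations.Include.h and Operations.lst, and declared below, maps a FLOAT32 input to the output operand's asymmetric uint8 encoding. For reference, the affine mapping such a kernel applies is q = clamp(round(x / scale) + zero_point, 0, 255); a minimal per-element sketch, illustrative only and not the cker implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>

// q = clamp(round(x / scale) + zero_point, 0, 255) for QUANT_UINT8_ASYMM.
uint8_t quantizeOne(float x, float scale, int32_t zero_point)
{
  const int32_t q = static_cast<int32_t>(std::round(x / scale)) + zero_point;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}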
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_IR_OPERATION_QUANTIZE_H__ +#define __ONERT_IR_OPERATION_QUANTIZE_H__ + +#include "ir/Operation.h" + +namespace onert +{ +namespace ir +{ +namespace operation +{ + +class Quantize : public Operation +{ +public: + enum Input + { + INPUT = 0, + }; + +public: + Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs); + +public: + void accept(OperationVisitor &v) const override; + OpCode opcode() const final { return OpCode::Quantize; } +}; + +} // namespace operation +} // namespace ir +} // namespace onert + +#endif // __ONERT_IR_OPERATION_QUANTIZE_H__ diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc index 32a8041..c374aba 100644 --- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc +++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc @@ -36,7 +36,7 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptrgetManagedTensor(ind); + auto user_tensor = _user_tensors->getNativeTensor(ind); if (user_tensor) { // User tensors cannot be reallocated. @@ -47,8 +47,8 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha user_tensor->setShape(new_shape); } - // NOTE Then handle managed tensors - auto tensor = _tensors->getManagedTensor(ind); + // NOTE Then handle native tensors + auto tensor = _tensors->getNativeTensor(ind); assert(tensor); bool previously_dynamic = tensor->is_dynamic(); @@ -101,9 +101,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout) { - assert(_tensors->getManagedTensor(ind) == nullptr); + assert(_tensors->getNativeTensor(ind) == nullptr); auto tensor = std::make_shared(tensor_info, backend_layout); - _tensors->setManagedTensor(ind, tensor); + _tensors->setNativeTensor(ind, tensor); } void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) @@ -130,7 +130,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) auto &input_set = find->second; for (auto input_ind : input_set) { - if (!_tensors->getManagedTensor(input_ind)->is_dynamic()) + if (!_tensors->getNativeTensor(input_ind)->is_dynamic()) continue; _dynamic_mem_mgr->deallocate(input_ind); @@ -141,7 +141,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) { - if (!_tensors->getManagedTensor(output_ind)->is_dynamic()) + if (!_tensors->getNativeTensor(output_ind)->is_dynamic()) return; _dynamic_mem_mgr->deallocate(output_ind); diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc index 4b683fb..eb83b7d 100644 --- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc +++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc @@ -81,23 +81,23 @@ void KernelGenerator::visit(const ir::operation::If &node) std::vector> input_tensors; for (const auto input_index 
: node.getInputs()) { - auto input_alloc = getTensor(input_index); + auto input_tensor = getTensor(input_index); - input_tensors.emplace_back(input_alloc); + input_tensors.emplace_back(input_tensor); } std::vector> output_tensors; exec::DynAllocInfoMap outputs_dyn_alloc_info; for (const auto output_index : node.getOutputs()) { - auto output_alloc = getTensor(output_index); + auto output_tensor = getTensor(output_index); - output_tensors.emplace_back(output_alloc); + output_tensors.emplace_back(output_tensor); const auto output_tensor_builder = getTensorBuilder(output_index); if (output_tensor_builder->supportDynamicTensor()) { auto output_dyn_manager = output_tensor_builder->dynamicTensorManager(); - outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager}; + outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager}; } } @@ -146,24 +146,24 @@ void KernelGenerator::visit(const ir::operation::While &node) std::vector> input_tensors; for (const auto input_index : node.getInputs()) { - auto input_alloc = getTensor(input_index); + auto input_tensor = getTensor(input_index); - input_tensors.emplace_back(input_alloc); + input_tensors.emplace_back(input_tensor); } std::vector> output_tensors; std::unordered_map, exec::DynAllocInfo> outputs_dyn_alloc_info; for (const auto output_index : node.getOutputs()) { - auto output_alloc = getTensor(output_index); + auto output_tensor = getTensor(output_index); - output_tensors.emplace_back(output_alloc); + output_tensors.emplace_back(output_tensor); const auto output_tensor_builder = getTensorBuilder(output_index); if (output_tensor_builder->supportDynamicTensor()) { auto output_dyn_manager = output_tensor_builder->dynamicTensorManager(); - outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager}; + outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager}; } } @@ -199,7 +199,7 @@ KernelGenerator::getTensorBuilder(const ir::OperandIndex &index) for (auto tensor_builder : _tensor_builder_set) { auto reg = tensor_builder->tensorRegistry(); - auto tensor = reg ? reg->getManagedITensor(index) : tensor_builder->tensorAt(index); + auto tensor = reg ? 
reg->getNativeITensor(index) : tensor_builder->tensorAt(index); if (tensor) { ret = tensor_builder; diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc index 16cd3ec..5bddb91 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc +++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc @@ -92,7 +92,7 @@ void TensorBuilder::allocate() std::shared_ptr TensorBuilder::tensorAt(const ir::OperandIndex &ind) { // NOTE Find from User Tensor Registry first - // FIXME There may be both user tensor and managed tensor for a `ind` which is a waste + // FIXME There may be both user tensor and native tensor for a `ind` which is a waste auto user_tensor = _user_tensor_reg->getITensor(ind); auto tensor = _tensor_reg->getITensor(ind); if (user_tensor) @@ -107,7 +107,7 @@ void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->ite std::shared_ptr TensorBuilder::at(const ir::OperandIndex &ind) { - return _tensor_reg->getManagedTensor(ind); + return _tensor_reg->getNativeTensor(ind); } std::unique_ptr TensorBuilder::releaseStaticTensorManager(void) @@ -123,7 +123,7 @@ std::unique_ptr TensorBuilder::releaseDynamicTensorManager(void) void TensorBuilder::setUserTensor(const ir::OperandIndex &ind, const std::shared_ptr &tensor) { - _user_tensor_reg->setManagedTensor(ind, tensor); + _user_tensor_reg->setNativeTensor(ind, tensor); } } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/controlflow/UserTensor.h index ce94ea0..b9b2d52 100644 --- a/runtime/onert/core/src/backend/controlflow/UserTensor.h +++ b/runtime/onert/core/src/backend/controlflow/UserTensor.h @@ -68,6 +68,7 @@ public: void set_dynamic() override { _dynamic = true; } ir::Shape getShape() const override { return _info.shape(); } void setShape(const ir::Shape &new_shape) override { _info.shape(new_shape); } + bool is_constant() const override { return false; } private: ir::OperandInfo _info; diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc index 0ccf700..ede403b 100644 --- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc @@ -35,7 +35,7 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha { VERBOSE_F() << ind << std::endl; - auto tensor = _tensors->getManagedTensor(ind); + auto tensor = _tensors->getNativeTensor(ind); assert(tensor); bool previously_dynamic = tensor->is_dynamic(); @@ -88,9 +88,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout) { - assert(_tensors->getManagedTensor(ind) == nullptr); + assert(_tensors->getNativeTensor(ind) == nullptr); auto tensor = std::make_shared(tensor_info, backend_layout); - _tensors->setManagedTensor(ind, tensor); + _tensors->setNativeTensor(ind, tensor); } void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) @@ -117,7 +117,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) auto &input_set = find->second; for (auto input_ind : input_set) { - auto *tensor = _tensors->getManagedTensor(input_ind).get(); + auto *tensor = _tensors->getNativeTensor(input_ind).get(); if (!tensor->is_dynamic()) continue; @@ -131,7 +131,7 @@ void 
DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) { - auto *tensor = _tensors->getManagedTensor(output_ind).get(); + auto *tensor = _tensors->getNativeTensor(output_ind).get(); if (!tensor->is_dynamic()) return; diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc index 47bea35..8604542 100644 --- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc @@ -33,7 +33,7 @@ StaticTensorManager::StaticTensorManager(const std::shared_ptr & void StaticTensorManager::allocateConsts(void) { - for (auto &pair : _tensors->managed_tensors()) + for (auto &pair : _tensors->native_tensors()) { const auto &ind = pair.first; auto tensor = pair.second; @@ -42,9 +42,9 @@ void StaticTensorManager::allocateConsts(void) auto mem_alloc = _const_mgr->allocate(ind, tensor->total_size()); tensor->setBuffer(mem_alloc); auto buffer = mem_alloc->base(); - VERBOSE(CPU_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() - << "): " << static_cast(buffer) - << "size : " << tensor->total_size() << std::endl; + VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() + << "): " << static_cast(buffer) + << "size : " << tensor->total_size() << std::endl; } } } @@ -53,7 +53,7 @@ void StaticTensorManager::allocateNonconsts(void) { _nonconst_mgr->allocate(); - for (auto &pair : _tensors->managed_tensors()) + for (auto &pair : _tensors->native_tensors()) { const auto &ind = pair.first; auto tensor = pair.second; @@ -62,8 +62,8 @@ void StaticTensorManager::allocateNonconsts(void) auto *buffer = _nonconst_mgr->getBuffer(ind); tensor->setBuffer(buffer); - VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() - << "): " << static_cast(buffer) << std::endl; + VERBOSE(CPU_COMMON_StaticTensorManager) << "TENSOR(#" << ind.value() + << "): " << static_cast(buffer) << std::endl; } } } @@ -76,18 +76,18 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout, bool as_const) { - assert(!_tensors->getManagedTensor(ind)); + assert(!_tensors->getNativeTensor(ind)); auto tensor = std::make_shared(tensor_info, backend_layout); - _tensors->setManagedTensor(ind, tensor); + _tensors->setNativeTensor(ind, tensor); _as_constants[ind] = as_const; } void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) { - assert(_tensors->getManagedTensor(ind)); + assert(_tensors->getNativeTensor(ind)); // This method is called only when a tensor has proper shape - assert(!_tensors->getManagedTensor(ind)->is_dynamic()); + assert(!_tensors->getNativeTensor(ind)->is_dynamic()); if (!_as_constants[ind]) _nonconst_mgr->claimPlan(ind, size); @@ -95,10 +95,10 @@ void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) { - assert(_tensors->getManagedTensor(ind)); + assert(_tensors->getNativeTensor(ind)); // This method is called only when a tensor has proper shape - assert(!_tensors->getManagedTensor(ind)->is_dynamic()); + assert(!_tensors->getNativeTensor(ind)->is_dynamic()); if (!_as_constants[ind]) _nonconst_mgr->releasePlan(ind); @@ -106,7 +106,7 @@ void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) void StaticTensorManager::iterate(const std::function &fn) { - for (const 
auto &it : _tensors->managed_tensors()) + for (const auto &it : _tensors->native_tensors()) fn(it.first); } diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc index f3f69ad..8439b6a 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.cc +++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc @@ -201,18 +201,35 @@ ExecutorFactory::initializeModelIOTensors(ir::LoweredGraph &lowered_graph, // Add tensor to controlflow TensorRegistry. cf_tensor_builder->setUserTensor(ind, tensor); ret.push_back(tensor); - - // Set other tensors as external tensors - for (auto &tensor_builder : tensor_builders) - { - // FIXME This is a workaround registering all user tensors to all backends - // FIXME Handle when it is failed - tensor_builder->setExternalTensor(ind, tensor); - } } return ret; } +void ExecutorFactory::prepareExternalTensors(ir::LoweredGraph &lowered_graph, + TensorBuilders &tensor_builders) +{ + lowered_graph.op_seqs().iterate( + [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) { + auto lower_info = lowered_graph.getLowerInfo(op_seq_index); + auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend()); + for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED | + ir::Remove::UNDEFINED) + { + // If an OpSequence input/output tensor does not have a own tensor object, + // it must be using external tensors, so find the tensor from other tensor builders and + // set the tensor to this tensor builder if portable + if (!backend_ctx->tensor_builder->tensorAt(ind)) + { + auto tensor = tensor_builders.getITensor(ind); + assert(tensor); // The tensor must have been created in one of TensorBuilders + auto ptensor = std::dynamic_pointer_cast(tensor); + if (ptensor) + backend_ctx->tensor_builder->setMigrantTensor(ind, ptensor); + } + } + }); +} + exec::IExecutor * ExecutorFactory::createLinearExecutor(std::unique_ptr lowered_graph, const compiler::CompilerOptions &options, @@ -265,6 +282,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr lowered_ tensor_builder->prepare(); } + prepareExternalTensors(*lowered_graph, tensor_builders); + ExecutionBuilder builder; // Generate kernels @@ -367,6 +386,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( tensor_builder->prepare(); } + prepareExternalTensors(*lowered_graph, tensor_builders); + ExecutionBuilder builder; // Generate kernels diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h index 1e82b98..418e5a7 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.h +++ b/runtime/onert/core/src/compiler/ExecutorFactory.h @@ -22,6 +22,7 @@ #include "backend/ITensor.h" #include "exec/IExecutor.h" #include "ir/LoweredGraph.h" +#include "TensorBuilders.h" namespace onert { @@ -48,6 +49,8 @@ private: static std::vector> initializeModelIOTensors(ir::LoweredGraph &lowered_graph, const ir::OperandIndexSequence &indices); + static void prepareExternalTensors(ir::LoweredGraph &lowered_graph, + TensorBuilders &tensor_builders); static exec::IExecutor * createLinearExecutor(std::unique_ptr lowered_graph, const compiler::CompilerOptions &options, diff --git a/runtime/onert/core/src/compiler/HEScheduler.h b/runtime/onert/core/src/compiler/HEScheduler.h index f507539..d8ceca9 100644 --- a/runtime/onert/core/src/compiler/HEScheduler.h +++ b/runtime/onert/core/src/compiler/HEScheduler.h @@ -51,16 +51,12 @@ public: * @param[in] backend_resolver 
backend resolver */ HEScheduler(const backend::BackendContexts &backend_contexts, const CompilerOptions &options) - : _backend_contexts{backend_contexts}, _is_supported{}, _backends_avail_time{}, _ops_eft{}, + : _is_supported{}, _backends_avail_time{}, _ops_eft{}, _op_to_rank{std::make_shared>()}, _is_profiling_mode{options.he_profiling_mode}, _is_linear_exec{options.executor == "Linear"}, _is_parallel_exec{options.executor == "Parallel"} { - // Workaround to avoid unused-private-field warning - // TODO use _backend_contexts and remove workaround - (void)_backend_contexts; - for (auto &entry : backend_contexts) { _all_backends.push_back(entry.first); @@ -165,7 +161,6 @@ private: // whether it should assign these backends to these nodes: // * It stores false for unsupported nodes // * During rank calculation with enabled profiling mode it stores true for supported nodes - const backend::BackendContexts &_backend_contexts; std::unordered_map> _is_supported; // Finishing and starting time of each backend std::unordered_map> _backends_avail_time; @@ -175,8 +170,7 @@ private: std::unique_ptr _backend_resolver; std::unique_ptr _exec_time; const ir::Graph *_graph{nullptr}; - std::vector - _all_backends; // TODO Remove this and use _backend_contexts instead + std::vector _all_backends; const backend::Backend *_cpu_backend{nullptr}; // TODO Change this to controlflow_backend bool _is_profiling_mode; bool _is_linear_exec; diff --git a/runtime/onert/core/src/compiler/OperationValidator.cc b/runtime/onert/core/src/compiler/OperationValidator.cc index 5c545ae..fa5ee27 100644 --- a/runtime/onert/core/src/compiler/OperationValidator.cc +++ b/runtime/onert/core/src/compiler/OperationValidator.cc @@ -41,6 +41,21 @@ OperationValidator::OperationValidator(const ir::Graph &graph) { } +void OperationValidator::checkUnaryOp(const ir::Operation &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(0)}; + + // Check if I/O types match + OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); + + if (_ctx.at(output_index).info().isDynamic()) + return; + + // Check if I/O shapes match + OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +} + void OperationValidator::operator()() { // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when @@ -53,16 +68,7 @@ void OperationValidator::operator()() [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); }); } -void OperationValidator::visit(const ir::operation::Abs &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Abs &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::AvgPool2D &node) { @@ -292,17 +298,7 @@ void OperationValidator::visit(const ir::operation::RNN &node) num_units == _ctx.at(hidden_state_out_index).shape().dim(1)); } -void OperationValidator::visit(const ir::operation::Round &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::Round::Input::INPUT)}; - - OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); - - if (_ctx.at(output_index).info().isDynamic()) - return; - 
OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Round &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::SpaceToBatchND &node) { @@ -393,17 +389,7 @@ void OperationValidator::visit(const ir::operation::EmbeddingLookup &node) } } -void OperationValidator::visit(const ir::operation::Exp &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - - OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); - - if (_ctx.at(output_index).info().isDynamic()) - return; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Exp &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::ExpandDims &node) { @@ -419,17 +405,7 @@ void OperationValidator::visit(const ir::operation::ExpandDims &node) OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1); } -void OperationValidator::visit(const ir::operation::Floor &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; - - OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); - - if (_ctx.at(output_index).info().isDynamic()) - return; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Floor &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::HashtableLookup &node) { @@ -789,6 +765,25 @@ void OperationValidator::visit(const ir::operation::LSTM &node) } } +void OperationValidator::visit(const ir::operation::L2Normalization &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)}; + + auto ifm_shape = _ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + + OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank()); + + for (auto i = 0; i < ifm_shape.rank(); i++) + { + OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i)); + } +} + void OperationValidator::visit(const ir::operation::Unpack &node) { const auto num{node.param().num}; @@ -904,35 +899,11 @@ void OperationValidator::visit(const ir::operation::Split &node) OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0); } -void OperationValidator::visit(const ir::operation::Cos &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} - -void OperationValidator::visit(const ir::operation::Sin &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; +void OperationValidator::visit(const ir::operation::Cos &node) { checkUnaryOp(node); } - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Sin &node) { checkUnaryOp(node); } -void OperationValidator::visit(const ir::operation::RSQRT &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if 
(_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::RSQRT &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::Shape &node) { @@ -972,35 +943,11 @@ void OperationValidator::visit(const ir::operation::While &node) // TODO Add to validate with subgraphs } -void OperationValidator::visit(const ir::operation::Neg &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; +void OperationValidator::visit(const ir::operation::Neg &node) { checkUnaryOp(node); } - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::Log &node) { checkUnaryOp(node); } -void OperationValidator::visit(const ir::operation::Log &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} - -void OperationValidator::visit(const ir::operation::LogicalNot &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} +void OperationValidator::visit(const ir::operation::LogicalNot &node) { checkUnaryOp(node); } void OperationValidator::visit(const ir::operation::SquaredDifference &node) { @@ -1118,5 +1065,25 @@ void OperationValidator::visit(const ir::operation::LogSoftmax &node) OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); } + +void OperationValidator::visit(const ir::operation::Quantize &node) +{ + VERBOSE(Quantize) << "Configure Quantize operation" << std::endl; + + OP_REQUIRES(node.getInputs().size() == 1); + OP_REQUIRES(node.getOutputs().size() == 1); + + const auto input_index{node.getInputs().at(0)}; + const auto output_index{node.getOutputs().at(0)}; + + OP_REQUIRES(_ctx.at(input_index).typeInfo().type() == ir::DataType::FLOAT32); + + if (_ctx.at(output_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM); + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); +} } // namespace compiler } // namespace onert diff --git a/runtime/onert/core/src/compiler/OperationValidator.h b/runtime/onert/core/src/compiler/OperationValidator.h index 6ceafe8..55a4dd5 100644 --- a/runtime/onert/core/src/compiler/OperationValidator.h +++ b/runtime/onert/core/src/compiler/OperationValidator.h @@ -70,6 +70,7 @@ public: void visit(const ir::operation::DepthToSpace &node) override; void visit(const ir::operation::Pack &node) override; void visit(const ir::operation::LSTM &node) override; + void visit(const ir::operation::L2Normalization &node) override; void visit(const ir::operation::Unpack &node) override; void visit(const ir::operation::Pad &node) override; void visit(const ir::operation::Min &node) override; @@ -93,9 +94,10 @@ public: void visit(const ir::operation::Range &node) override; void visit(const ir::operation::MatrixBandPart &node) override; void visit(const ir::operation::LogSoftmax &node) 
override; + void visit(const ir::operation::Quantize &node) override; private: - void checkReduceOp(const ir::OperandIndex input_index, const ir::OperandIndex output_index); + void checkUnaryOp(const ir::Operation &node); private: // TODO Remove _ctx field diff --git a/runtime/onert/core/src/compiler/StaticShapeInference.cc b/runtime/onert/core/src/compiler/StaticShapeInference.cc index 5a58f2e..66de599 100644 --- a/runtime/onert/core/src/compiler/StaticShapeInference.cc +++ b/runtime/onert/core/src/compiler/StaticShapeInference.cc @@ -497,6 +497,11 @@ void StaticShapeInferer::visit(const ir::operation::Logistic &op) handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::Input::INPUT)); } +void StaticShapeInferer::visit(const ir::operation::L2Normalization &op) +{ + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::Input::INPUT)); +} + void StaticShapeInferer::visit(const ir::operation::MatrixBandPart &op) { handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)); diff --git a/runtime/onert/core/src/compiler/TensorBuilders.h b/runtime/onert/core/src/compiler/TensorBuilders.h index 4bb7413..c0a1ebc 100644 --- a/runtime/onert/core/src/compiler/TensorBuilders.h +++ b/runtime/onert/core/src/compiler/TensorBuilders.h @@ -23,6 +23,7 @@ #include "backend/Backend.h" #include "backend/controlflow/Config.h" #include "backend/controlflow/TensorBuilder.h" +#include "util/logging.h" namespace onert { @@ -66,6 +67,17 @@ public: return _cf_tensor_builder; } + std::shared_ptr getITensor(ir::OperandIndex ind) + { + for (auto &tensor_builder : _tensor_builders) + { + auto tensor = tensor_builder->tensorAt(ind); + if (tensor) + return tensor; + } + return nullptr; + } + private: std::unordered_set> _tensor_builders; std::shared_ptr _cf_tensor_builder; diff --git a/runtime/onert/core/src/exec/DynamicShapeInference.cc b/runtime/onert/core/src/exec/DynamicShapeInference.cc index 1b82029..28e92ba 100644 --- a/runtime/onert/core/src/exec/DynamicShapeInference.cc +++ b/runtime/onert/core/src/exec/DynamicShapeInference.cc @@ -442,6 +442,11 @@ void DynamicShapeInferer::visit(const ir::operation::Logistic &op) handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::INPUT)); } +void DynamicShapeInferer::visit(const ir::operation::L2Normalization &op) +{ + handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::INPUT)); +} + void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op) { handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT)); diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc index a7409b9..864ccb3 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.cc +++ b/runtime/onert/core/src/exec/ExecutorBase.cc @@ -46,7 +46,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_graph, { auto tensor_registry = tensor_builder->tensorRegistry(); assert(tensor_registry); - tensor = tensor_registry->getManagedITensor(ind); + tensor = tensor_registry->getNativeITensor(ind); if (tensor != nullptr) { if (tensor_builder->supportDynamicTensor()) @@ -71,7 +71,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr &&lowered_graph, { auto tensor_registry = tensor_builder->tensorRegistry(); assert(tensor_registry); - tensor = tensor_registry->getManagedITensor(ind); + tensor = tensor_registry->getNativeITensor(ind); if (tensor != nullptr) { if (tensor_builder->supportDynamicTensor()) diff --git a/runtime/onert/core/src/interp/operations/Pad.cc 
b/runtime/onert/core/src/interp/operations/Pad.cc index d2e3627..c8dce69 100644 --- a/runtime/onert/core/src/interp/operations/Pad.cc +++ b/runtime/onert/core/src/interp/operations/Pad.cc @@ -69,8 +69,8 @@ void invoke(const ITensor *input_tensor, const ITensor *pad_tensor, const ITenso const int32_t *pad_ptr = reinterpret_cast(pad_buffer); float *output_ptr = reinterpret_cast(output_buffer); - nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, output_ptr, - nullptr); + nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, + output_ptr, nullptr); } void invokePad(const ExecEnv *env, const ir::Operation &node) diff --git a/runtime/onert/core/src/ir/LoweredGraph.cc b/runtime/onert/core/src/ir/LoweredGraph.cc index 6e93a23..f138089 100644 --- a/runtime/onert/core/src/ir/LoweredGraph.cc +++ b/runtime/onert/core/src/ir/LoweredGraph.cc @@ -122,9 +122,6 @@ LoweredGraph::LoweredGraph(const Graph &graph, const compiler::CompilerOptions & pass::PermutationInsertionPass pi_pass(*this); pi_pass.run(); - // Implemented code no longer works. - // pass::PermutationEliminationPass pe_pass(*this); - // pe_pass.run(); _op_seqs.dump("merged and sorted operations with permutation", _graph.operations()); } diff --git a/runtime/onert/core/src/ir/operation/Quantize.cc b/runtime/onert/core/src/ir/operation/Quantize.cc new file mode 100644 index 0000000..0e3d5b6 --- /dev/null +++ b/runtime/onert/core/src/ir/operation/Quantize.cc @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ir/operation/Quantize.h" + +#include "ir/OperationVisitor.h" + +namespace onert +{ +namespace ir +{ +namespace operation +{ + +void Quantize::accept(OperationVisitor &v) const { v.visit(*this); } + +Quantize::Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs) + : Operation{OperandConstraint::createExact(2u), inputs, outputs} +{ +} + +} // namespace operation +} // namespace ir +} // namespace onert diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc deleted file mode 100644 index 9e0291e..0000000 --- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "PermutationEliminationPass.h" - -#include "ir/Operand.h" -#include "ir/operand/LowerInfo.h" -#include "ir/Graph.h" -#include "backend/IConfig.h" -#include "util/logging.h" - -namespace onert -{ -namespace ir -{ -namespace pass -{ -void PermutationEliminationPass::callback(const OperandIndex &inp_index, Operand &object) -{ - if (_graph.getInputs().contains(inp_index)) - { - eliminateInput(inp_index, object); - } - else if (_graph.getOutputs().contains(inp_index)) - { - eliminateOutput(inp_index, object); - } -} - -void PermutationEliminationPass::eliminateInput(const OperandIndex &inp_index, Operand &object) -{ - auto &model_inputs = _graph.getInputs(); - - // get uses of the model's given input - auto uses = object.getUses(); - - // input must be used just by permutation - if (uses.size() != 1) - { - return; - } - - for (auto input_use : uses) - { - auto &perm_operation = _graph.operations().at(input_use); - auto perm_inputs = perm_operation.getInputs(); - - auto perm_outputs = perm_operation.getOutputs(); - - if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, true)) - { - return; - } - - assert(perm_inputs.at(0) == inp_index); - - VERBOSE(PermutationEliminationPass::EliminateInput) << "remove NHWC_TO_NCHW permutation\n"; - - // set model's new input, which was output of permutation - model_inputs.replace(inp_index, perm_outputs.at(0)); - - // remove model's input, which is also input of permutation - _graph.removeOperand(inp_index); - - // remove permutation operation - assert(_lowered_graph.op_seqs().containsOperation(input_use)); - auto op_seq_idx = _lowered_graph.op_seqs().getOperation(input_use); - _lowered_graph.op_seqs().remove(op_seq_idx); - _graph.operations().remove(input_use); - - VERBOSE(PermutationEliminationPass::EliminateInput) - << inp_index.value() << " is model's input and is removed. New input is " - << perm_outputs.at(0).value() << "\n" - << input_use.value() << " is removed permutation operation\n"; - } -} - -void PermutationEliminationPass::eliminateOutput(const OperandIndex &out_index, Operand &object) -{ - auto &model_outputs = _graph.getOutputs(); - - // get defs of the model's given output - auto defs = object.getDef(); - - // output must use just permutation - if (defs.size() != 1) - { - return; - } - - for (auto output_def : defs) - { - auto &perm_operation = _graph.operations().at(output_def); - auto perm_outputs = perm_operation.getOutputs(); - - auto perm_inputs = perm_operation.getInputs(); - if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, false)) - { - return; - } - - assert(perm_outputs.at(0) == out_index); - - VERBOSE(PermutationEliminationPass::EliminateOutput) << "remove NCHW_TO_NHWC permutation\n"; - - // Update operations' output that is used by permute operand - for (auto perm_input_index : perm_inputs) - { - auto &perm_input_operand = _graph.operands().at(perm_input_index); - perm_input_operand.removeUse(output_def); - } - - // set model's new output, which was input of permutation - model_outputs.replace(out_index, perm_inputs.at(0)); - - // remove model's output, which is also output of permutation - _graph.removeOperand(out_index); - - // remove permutation operation - assert(_lowered_graph.op_seqs().containsOperation(output_def)); - auto op_seq_idx = _lowered_graph.op_seqs().getOperation(output_def); - _lowered_graph.op_seqs().remove(op_seq_idx); - _graph.operations().remove(output_def); - - VERBOSE(PermutationEliminationPass::EliminateOutput) - << out_index.value() << " is model's output and is removed. 
New output is " - << perm_inputs.at(0).value() << "\n" - << output_def.value() << " is removed permutation operation\n"; - } -} - -bool PermutationEliminationPass::isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes, - const OperandIndexSequence &out_indexes, - bool is_for_model_input) -{ - auto input_def_factors = _lowered_graph.getLowerInfo(inp_indexes.at(0))->def_factors(); - auto output_def_factors = _lowered_graph.getLowerInfo(out_indexes.at(0))->def_factors(); - - auto input_layout = input_def_factors.getOnlyElement().layout(); - auto output_layout = output_def_factors.getOnlyElement().layout(); - - if (input_def_factors.size() != 1 || output_def_factors.size() != 1) - { - return false; - } - - // all operands' factor must be the same - for (auto index : inp_indexes) - { - auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors(); - if (op_factor_set.size() != 1 || - input_layout != _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout()) - { - return false; - } - } - // all operands' factor must be the same - for (auto index : out_indexes) - { - auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors(); - if (op_factor_set.size() != 1 || - output_layout != - _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout()) - { - return false; - } - } - - if (is_for_model_input) - { - // check if this is NHWC_TO_NCHW permutation: must have single input, which is model's input - return (inp_indexes.size() == 1 && input_layout == Layout::NHWC && - output_layout == Layout::NCHW); - } - - // check if this is NCHW_TO_NHWC permutation: must have single output, which is model's output - return (out_indexes.size() == 1 && input_layout == Layout::NCHW && output_layout == Layout::NHWC); -} - -} // namespace pass -} // namespace ir -} // namespace onert diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h deleted file mode 100644 index 1c84300..0000000 --- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ -#define __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ - -#include "LoweredOperandPass.h" -#include "ir/Operand.h" -#include "ir/OperandIndexSequence.h" - -namespace onert -{ -namespace ir -{ -namespace pass -{ - -class PermutationEliminationPass : public LoweredOperandPass -{ -public: - using LoweredOperandPass::LoweredOperandPass; - -public: - std::string id() override { return "PermutationEliminationPass"; } - - void callback(const OperandIndex &index, Operand &object) override; - -private: - /** - * @brief Remove Permute operation that permutates input - * - * Note: This function aslo removes model's input and - * sets output of permutation as model's new input - * - * @param inp_index is the target operand index for the elimination - * @param object is the target operand object for the elimination - * - * @return - */ - void eliminateInput(const OperandIndex &inp_index, Operand &object); - - /** - * @brief Remove Permute operation that permutates output of a model - * - * Note: This function aslo removes model's output and - * sets input of permutation as model's new output - * - * @param out_index is the target operand index for the elimination - * @param object is the target operand object for the elimination - * - * @return - */ - void eliminateOutput(const OperandIndex &out_index, Operand &object); - - /** - * @brief Determine if passed operands are permute layer's input and output, that must be - * eliminated - * - * @param inp_index indexes of the input operand to operation - * @param out_index indexes of the output operand to operation - * @param is_for_model_input checking for model's input or output - * - * @return if it is permutation layer - */ - bool isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes, - const OperandIndexSequence &out_indexes, bool is_for_model_input); -}; - -} // namespace pass -} // namespace ir -} // namespace onert - -#endif // __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__ diff --git a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc index 7c3da52..75efdd8 100644 --- a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc +++ b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc @@ -62,27 +62,26 @@ void PermutationInsertionPass::callback(const OperandIndex &index, Operand &obje auto insert_set = operand_li->use_factors() - operand_li->def_factors(); auto def_factor = operand_li->def_factors().getOnlyElement(); - auto compatible_backends = [](auto /* backend1 */, auto /* backend2 */) { - // TODO If other issues for Permute elimination are resolved, enable this - return false; - /* + auto compatible_backends = [](auto backend1, auto backend2) { // TODO This is a workaround for not inserting Permute between cpu and controlflow. // To be general, we need another way of checking they are compatible. const auto cf = backend::controlflow::Config::ID; const auto cpu = "cpu"; const auto id1 = backend1->config()->id(); const auto id2 = backend2->config()->id(); - return (id1 == cpu && id2 == cf) // Allows no-Permute for Model inputs - || (id1 == cf && id2 == cpu); // Allows no-Permute for Model outputs - */ + // NOTE This is to skip Permute insertion for model inputs(controlflow -> cpu), but not + // outputs. This function currently assumes that backend1 is Def and backend2 is Use. However + // it is going to be fixed soon. 
+ // TODO make both ways work + return (id1 == cpu && id2 == cf); }; for (auto factor : insert_set) { + // Check exceptional cases that Permute ops are not inserted if (factor.layout() == def_factor.layout() && compatible_backends(factor.backend(), def_factor.backend())) { - // For this factor we can just reuse existing operand - Permute is not added. VERBOSE(PermutationInsertionPass) << "Permutation Insertion is skipped for operand " << index << " / as the tensor is compatible with backend " << factor.backend()->config()->id() << std::endl; diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h index f5687ad..f763346 100644 --- a/runtime/onert/frontend/base_loader/include/base_loader.h +++ b/runtime/onert/frontend/base_loader/include/base_loader.h @@ -171,6 +171,8 @@ protected: void loadBroadcastTo(const Operator *op, ir::Graph &subg); void loadFusedBatchNorm(const Operator *op, ir::Graph &subg); void loadLogSoftmax(const Operator *op, ir::Graph &subg); + void loadQuantize(const Operator *op, ir::Graph &subg); + void loadSpaceToDepth(const Operator *op, ir::Graph &subg); protected: // Base address for mapped region for loading (if needed) @@ -1123,6 +1125,22 @@ void BaseLoader::loadBroadcastTo(const Operator *o std::unique_ptr new_op(new ir::operation::BroadcastTo(inputs, outputs)); subg.addOperation(std::move(new_op)); } +template +void BaseLoader::loadSpaceToDepth(const Operator *op, ir::Graph &subg) +{ + ir::OperandIndexSequence inputs; + ir::OperandIndexSequence outputs; + ir::operation::SpaceToDepth::Param param; + + const auto *options = op->builtin_options_as_SpaceToDepthOptions(); + + param.block_size = options->block_size(); + + loadOperationIO(op, inputs, outputs); + + std::unique_ptr new_op(new ir::operation::SpaceToDepth(inputs, outputs, param)); + subg.addOperation(std::move(new_op)); +} template void BaseLoader::loadCustom(const Operator *op, ir::Graph &subg) @@ -1743,6 +1761,18 @@ void BaseLoader::loadLogSoftmax(const Operator *op } template +void BaseLoader::loadQuantize(const Operator *op, ir::Graph &subg) +{ + ir::OperandIndexSequence inputs; + ir::OperandIndexSequence outputs; + + loadOperationIO(op, inputs, outputs); + + std::unique_ptr new_op(new ir::operation::Quantize(inputs, outputs)); + subg.addOperation(std::move(new_op)); +} + +template void BaseLoader::loadOperation(const Operator *op, ir::Graph &subg) { const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); @@ -1959,6 +1989,12 @@ void BaseLoader::loadOperation(const Operator *op, case BuiltinOperator::BuiltinOperator_LOG_SOFTMAX: loadLogSoftmax(op, subg); return; + case BuiltinOperator::BuiltinOperator_QUANTIZE: + loadQuantize(op, subg); + return; + case BuiltinOperator::BuiltinOperator_SPACE_TO_DEPTH: + loadSpaceToDepth(op, subg); + return; default: throw std::runtime_error( std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc index 94791f8..00ffcb6 100644 --- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc +++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc @@ -106,6 +106,33 @@ getReduceGenerator(const onert::ir::operation::Reduce::ReduceType reduce_type) }; } +template +Operation *CreateSimpleUnaryOp(const OperationFactory::Param &init_param, Operands &) +{ + assert(init_param.input_count == 1 && init_param.output_count 
== 1); + + OperandIndexSequence outputs{init_param.outputs[0]}; + + // Each input should be interpreted as follows: + // + // 0 -> Input Tensor Index + OperandIndexSequence inputs{init_param.inputs[0]}; + + return new T{inputs, outputs}; +} + +// A generator function for binary ops with no params +template +Operation *createSimpleBinaryOp(const OperationFactory::Param &init_param, Operands &) +{ + assert(init_param.input_count == 2 && init_param.output_count == 1); + + OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; + OperandIndexSequence outputs{init_param.outputs[0]}; + + return new T{inputs, outputs}; +} + } // namespace OperationFactory &OperationFactory::get() @@ -116,20 +143,10 @@ OperationFactory &OperationFactory::get() OperationFactory::OperationFactory() { - _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = [](const OperationFactory::Param &init_param, - Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - // 1 -> Block size Index - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::BatchToSpaceND{inputs, outputs}; - }; + // Each input should be interpreted as follows: + // 0 -> Input Tensor Index + // 1 -> Block size Index + _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = createSimpleBinaryOp; _map[ANEURALNETWORKS_DEPTHWISE_CONV_2D] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -724,44 +741,11 @@ OperationFactory::OperationFactory() return new operation::Squeeze{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_TANH] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Tanh{inputs, outputs}; - }; - - _map[ANEURALNETWORKS_LOG] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); + _map[ANEURALNETWORKS_TANH] = CreateSimpleUnaryOp; - OperandIndexSequence outputs{init_param.outputs[0]}; + _map[ANEURALNETWORKS_LOG] = CreateSimpleUnaryOp; - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Log{inputs, outputs}; - }; - - _map[ANEURALNETWORKS_LOGISTIC] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Logistic{inputs, outputs}; - }; + _map[ANEURALNETWORKS_LOGISTIC] = CreateSimpleUnaryOp; _map[ANEURALNETWORKS_DIV] = [](const OperationFactory::Param &init_param, Operands &operands) { assert(init_param.input_count == 3 && init_param.output_count == 1); @@ -784,36 +768,16 @@ OperationFactory::OperationFactory() return new operation::Div{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_EXP] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence 
outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Exp{inputs, outputs}; - }; + _map[ANEURALNETWORKS_EXP] = CreateSimpleUnaryOp; // ANEURALNETWORKS_EXP_EX is deprecated // TODO Remove ANEURALNETWORKS_EXP_EX _map[ANEURALNETWORKS_EXP_EX] = _map[ANEURALNETWORKS_EXP]; - _map[ANEURALNETWORKS_EXPAND_DIMS] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - // 1 -> Axis Tensor Index - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::ExpandDims{inputs, outputs}; - }; + // Each input should be interpreted as follows: + // 0 -> Input Tensor Index + // 1 -> Axis Tensor Index + _map[ANEURALNETWORKS_EXPAND_DIMS] = createSimpleBinaryOp; _map[ANEURALNETWORKS_GREATER] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 2 && init_param.output_count == 1); @@ -982,19 +946,7 @@ OperationFactory::OperationFactory() return new operation::Comparison{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_LOGICAL_AND] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> input0 Tensor Index - // 1 -> input1 Tensor Index - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::LogicalAnd{inputs, outputs}; - }; + _map[ANEURALNETWORKS_LOGICAL_AND] = createSimpleBinaryOp; // ANEURALNETWORKS_LOGICAL_AND_EX is deprecated // TODO Remove ANEURALNETWORKS_LOGICAL_AND_EX @@ -1018,18 +970,7 @@ OperationFactory::OperationFactory() return new operation::LogicalAnd{inputs, outputs}; }; - _map[ANEURALNETWORKS_RSQRT] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::RSQRT{inputs, outputs}; - }; + _map[ANEURALNETWORKS_RSQRT] = CreateSimpleUnaryOp; _map[ANEURALNETWORKS_SELECT] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 3 && init_param.output_count == 1); @@ -1065,18 +1006,7 @@ OperationFactory::OperationFactory() // TODO Remove ANEURALNETWORKS_RSQRT_EX _map[ANEURALNETWORKS_RSQRT_EX] = _map[ANEURALNETWORKS_RSQRT]; - _map[ANEURALNETWORKS_RELU] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::ReLU{inputs, outputs}; - }; + _map[ANEURALNETWORKS_RELU] = CreateSimpleUnaryOp; _map[ANEURALNETWORKS_RESIZE_BILINEAR] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -1098,31 +1028,9 @@ OperationFactory::OperationFactory() return new operation::ResizeBilinear{inputs, outputs, 
param}; }; - _map[ANEURALNETWORKS_RELU1] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; + _map[ANEURALNETWORKS_RELU1] = CreateSimpleUnaryOp; - return new operation::ReLU1{inputs, outputs}; - }; - - _map[ANEURALNETWORKS_RELU6] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::ReLU6{inputs, outputs}; - }; + _map[ANEURALNETWORKS_RELU6] = CreateSimpleUnaryOp; _map[ANEURALNETWORKS_REVERSE_EX] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 2 && init_param.output_count == 1); @@ -1438,18 +1346,7 @@ OperationFactory::OperationFactory() return new operation::LogicalOr{inputs, outputs}; }; - _map[ANEURALNETWORKS_LOGICAL_NOT] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::LogicalNot{inputs, outputs}; - }; + _map[ANEURALNETWORKS_LOGICAL_NOT] = CreateSimpleUnaryOp; // ANEURALNETWORKS_LOGICAL_NOT_EX is deprecated // TODO Remove ANEURALNETWORKS_LOGICAL_NOT_EX @@ -1649,35 +1546,13 @@ OperationFactory::OperationFactory() // TODO Remove ANEURALNETWORKS_GATHER_EX _map[ANEURALNETWORKS_GATHER_EX] = _map[ANEURALNETWORKS_GATHER]; - _map[ANEURALNETWORKS_NEG] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Neg{inputs, outputs}; - }; + _map[ANEURALNETWORKS_NEG] = CreateSimpleUnaryOp; // ANEURALNETWORKS_NEG_EX is deprecated // TODO Remove ANEURALNETWORKS_NEG_EX _map[ANEURALNETWORKS_NEG_EX] = _map[ANEURALNETWORKS_NEG]; - _map[ANEURALNETWORKS_ABS] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Abs{inputs, outputs}; - }; + _map[ANEURALNETWORKS_ABS] = CreateSimpleUnaryOp; // ANEURALNETWORKS_ABS_EX is deprecated // TODO Remove ANEURALNETWORKS_ABS_EX @@ -1704,18 +1579,7 @@ OperationFactory::OperationFactory() // TODO Remove ANEURALNETWORKS_ARGMAX_EX _map[ANEURALNETWORKS_ARGMAX_EX] = _map[ANEURALNETWORKS_ARGMAX]; - _map[ANEURALNETWORKS_DEQUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 1 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 
0 -> Input Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; - - return new operation::Dequantize{inputs, outputs}; - }; + _map[ANEURALNETWORKS_DEQUANTIZE] = CreateSimpleUnaryOp; _map[ANEURALNETWORKS_MEAN] = [](const OperationFactory::Param &init_param, Operands &operands) { assert(init_param.input_count == 3 && init_param.output_count == 1); @@ -1841,31 +1705,24 @@ OperationFactory::OperationFactory() }; _map[ANEURALNETWORKS_PAD] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count >= 1); + assert(init_param.input_count >= 2 && init_param.input_count <= 3 && + init_param.output_count >= 1); OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; + if (init_param.input_count == 3) + { + inputs.append(OperandIndex{init_param.inputs[2]}); + } OperandIndexSequence outputs{init_param.outputs[0]}; return new operation::Pad{inputs, outputs}; }; - _map[ANEURALNETWORKS_MINIMUM] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); + _map[ANEURALNETWORKS_PAD_V2] = _map[ANEURALNETWORKS_PAD]; - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - OperandIndexSequence outputs{init_param.outputs[0]}; + _map[ANEURALNETWORKS_MINIMUM] = createSimpleBinaryOp; - return new operation::Min{inputs, outputs}; - }; - - _map[ANEURALNETWORKS_MAXIMUM] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - OperandIndexSequence outputs{init_param.outputs[0]}; - - return new operation::Max{inputs, outputs}; - }; + _map[ANEURALNETWORKS_MAXIMUM] = createSimpleBinaryOp; _map[ANEURALNETWORKS_ONE_HOT_EX] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -1948,34 +1805,15 @@ OperationFactory::OperationFactory() return new operation::Range{inputs, outputs}; }; - _map[ANEURALNETWORKS_POW] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); + // Each input should be interpreted as follows: + // 0 -> LHS Tensor Index + // 1 -> RHS Tensor Index + _map[ANEURALNETWORKS_POW] = createSimpleBinaryOp; - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> LHS Tensor Index - // 1 -> RHS Tensor Index - - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::Pow{inputs, outputs}; - }; - - _map[ANEURALNETWORKS_FILL_EX] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - // Each input should be interpreted as follows: - // - // 0 -> A tensor, specifying the input. - // 1 -> A 1-D tensor, specifying the value - - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - OperandIndexSequence outputs{init_param.outputs[0]}; - - return new operation::Fill{inputs, outputs}; - }; + // Each input should be interpreted as follows: + // 0 -> A tensor, specifying the input. 
+ // 1 -> A 1-D tensor, specifying the value + _map[ANEURALNETWORKS_FILL_EX] = createSimpleBinaryOp; _map[ANEURALNETWORKS_ZEROS_LIKE_EX] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 1 && init_param.output_count == 1); @@ -1989,20 +1827,10 @@ OperationFactory::OperationFactory() return new operation::ZerosLike{inputs, outputs}; }; - _map[ANEURALNETWORKS_TILE] = [](const OperationFactory::Param &init_param, Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - // 1 -> Multiple Tensor Index - - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::Tile{inputs, outputs}; - }; + // Each input should be interpreted as follows: + // 0 -> Input Tensor Index + // 1 -> Multiple Tensor Index + _map[ANEURALNETWORKS_TILE] = createSimpleBinaryOp; _map[ANEURALNETWORKS_MATRIX_BAND_PART_EX] = [](const OperationFactory::Param &init_param, Operands &) { @@ -2064,21 +1892,9 @@ OperationFactory::OperationFactory() return new operation::Einsum{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_BROADCAST_TO_EX] = [](const OperationFactory::Param &init_param, - Operands &) { - assert(init_param.input_count == 2 && init_param.output_count == 1); - - OperandIndexSequence outputs{init_param.outputs[0]}; - - // Each input should be interpreted as follows: - // - // 0 -> Input Tensor Index - // 1 -> int32, int64, An 1-D int tensor Index - - OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; - - return new operation::BroadcastTo{inputs, outputs}; - }; + // 0 -> Input Tensor Index + // 1 -> int32, int64, An 1-D int tensor Index + _map[ANEURALNETWORKS_BROADCAST_TO_EX] = createSimpleBinaryOp; _map[ANEURALNETWORKS_FUSED_BATCH_NORM_V3_EX] = [](const OperationFactory::Param &init_param, Operands &operands) { @@ -2133,6 +1949,15 @@ OperationFactory::OperationFactory() return new operation::LogSoftmax{inputs, outputs, param}; }; + + _map[ANEURALNETWORKS_QUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) { + assert(init_param.input_count == 1 && init_param.output_count == 1); + + OperandIndexSequence inputs{init_param.inputs[0]}; + OperandIndexSequence outputs{init_param.outputs[0]}; + + return new operation::Quantize{inputs, outputs}; + }; } Operation *OperationFactory::create(ANeuralNetworksOperationType type, diff --git a/runtime/onert/test/core/exec/ExecInstance.cc b/runtime/onert/test/core/exec/ExecInstance.cc index cc04347..0fcf372 100644 --- a/runtime/onert/test/core/exec/ExecInstance.cc +++ b/runtime/onert/test/core/exec/ExecInstance.cc @@ -73,9 +73,8 @@ public: // Compile auto subgs = std::make_shared(); subgs->push(onert::ir::SubgraphIndex{0}, graph); - auto compiler = new onert::compiler::Compiler{subgs}; - executors = compiler->compile(); - delete compiler; + onert::compiler::Compiler compiler{subgs}; + executors = compiler.compile(); } public: @@ -98,19 +97,17 @@ TEST(ExecInstance, simple) float output_buffer[4] = {}; const float output_expected[4] = {5, -2, 0, -1}; - auto execution = new onert::exec::Execution(executors); + onert::exec::Execution execution{executors}; - execution->setInput(input1, reinterpret_cast(input1_buffer), 16); - execution->setInput(input2, reinterpret_cast(input2_buffer), 16); - execution->setOutput(output, reinterpret_cast(output_buffer), 16); - execution->execute(); + 
execution.setInput(input1, reinterpret_cast(input1_buffer), 16); + execution.setInput(input2, reinterpret_cast(input2_buffer), 16); + execution.setOutput(output, reinterpret_cast(output_buffer), 16); + execution.execute(); for (auto i = 0; i < 4; i++) { EXPECT_EQ(output_buffer[i], output_expected[i]); } - - delete execution; } TEST(ExecInstance, twoCompile) @@ -118,7 +115,7 @@ TEST(ExecInstance, twoCompile) auto mockup = CompiledMockUpModel(); auto graph = mockup.graph; auto executors1 = mockup.executors; - auto execution1 = new onert::exec::Execution(executors1); + onert::exec::Execution execution1{executors1}; auto input1 = IOIndex{0}; auto input2 = IOIndex{1}; @@ -129,38 +126,34 @@ TEST(ExecInstance, twoCompile) float exe1_output_buffer[4] = {}; const float exe1_output_expected[4] = {5, -2, 0, -1}; - execution1->setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); - execution1->setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); - execution1->setOutput(output, reinterpret_cast(exe1_output_buffer), 16); + execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); + execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); + execution1.setOutput(output, reinterpret_cast(exe1_output_buffer), 16); // Make new executor: compile again auto subgs = std::make_shared(); subgs->push(onert::ir::SubgraphIndex{0}, graph); - auto compiler = new onert::compiler::Compiler{subgs}; - std::shared_ptr executors2 = compiler->compile(); - auto execution2 = new onert::exec::Execution(executors2); + onert::compiler::Compiler compiler{subgs}; + std::shared_ptr executors2 = compiler.compile(); + onert::exec::Execution execution2{executors2}; const float exe2_input1_buffer[4] = {2, 1, -2, 0}; const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; float exe2_output_buffer[4] = {}; const float exe2_output_expected[4] = {2, 5, -2, 7}; - execution2->setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); - execution2->setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); - execution2->setOutput(output, reinterpret_cast(exe2_output_buffer), 16); + execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); + execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); + execution2.setOutput(output, reinterpret_cast(exe2_output_buffer), 16); - execution1->execute(); - execution2->execute(); + execution1.execute(); + execution2.execute(); for (auto i = 0; i < 4; i++) { EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); } - - delete compiler; - delete execution1; - delete execution2; } // Support two initialized execution instance then ordered execution @@ -178,32 +171,29 @@ TEST(ExecInstance, twoExecution) const float exe1_output_expected[4] = {5, -2, 0, -1}; const float exe2_output_expected[4] = {2, 5, -2, 7}; - auto execution1 = new onert::exec::Execution(executors); - execution1->setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); - execution1->setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); - execution1->setOutput(output1, reinterpret_cast(exe1_output_buffer), 16); + onert::exec::Execution execution1{executors}; + execution1.setInput(input1, reinterpret_cast(exe1_input1_buffer), 16); + execution1.setInput(input2, reinterpret_cast(exe1_input2_buffer), 16); + execution1.setOutput(output1, reinterpret_cast(exe1_output_buffer), 16); const float exe2_input1_buffer[4] = {2, 1, -2, 0}; const float exe2_input2_buffer[4] = {-3, 3, 1, 2}; float exe2_output_buffer[4] = {}; // 
Make new execution - auto execution2 = new onert::exec::Execution(executors); - execution2->setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); - execution2->setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); - execution2->setOutput(output1, reinterpret_cast(exe2_output_buffer), 16); + onert::exec::Execution execution2{executors}; + execution2.setInput(input1, reinterpret_cast(exe2_input1_buffer), 16); + execution2.setInput(input2, reinterpret_cast(exe2_input2_buffer), 16); + execution2.setOutput(output1, reinterpret_cast(exe2_output_buffer), 16); - execution1->execute(); - execution2->execute(); + execution1.execute(); + execution2.execute(); for (auto i = 0; i < 4; i++) { EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]); EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]); } - - delete execution1; - delete execution2; } class Inference @@ -222,14 +212,12 @@ public: auto input2 = IOIndex{1}; auto output1 = IOIndex{0}; - auto execution = new onert::exec::Execution(_executors); - execution->setInput(input1, reinterpret_cast(_input1), 16); - execution->setInput(input2, reinterpret_cast(_input2), 16); - execution->setOutput(output1, reinterpret_cast(_output), 16); + onert::exec::Execution execution{_executors}; + execution.setInput(input1, reinterpret_cast(_input1), 16); + execution.setInput(input2, reinterpret_cast(_input2), 16); + execution.setOutput(output1, reinterpret_cast(_output), 16); - execution->execute(); - - delete execution; + execution.execute(); } private: @@ -288,20 +276,18 @@ TEST(ExecInstance, async) float output_buffer[4] = {}; const float output_expected[4] = {5, -2, 0, -1}; - auto execution = new onert::exec::Execution(executors); + onert::exec::Execution execution{executors}; - execution->setInput(input1, reinterpret_cast(input1_buffer), 16); - execution->setInput(input2, reinterpret_cast(input2_buffer), 16); - execution->setOutput(output, reinterpret_cast(output_buffer), 16); - execution->startExecute(); - execution->waitFinish(); + execution.setInput(input1, reinterpret_cast(input1_buffer), 16); + execution.setInput(input2, reinterpret_cast(input2_buffer), 16); + execution.setOutput(output, reinterpret_cast(output_buffer), 16); + execution.startExecute(); + execution.waitFinish(); for (auto i = 0; i < 4; i++) { EXPECT_EQ(output_buffer[i], output_expected[i]); } - - delete execution; } } // namespace diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl index e50b941..005f61c 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl @@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8 GeneratedTests.cast_float16_to_quant8_overflow GeneratedTests.cast_float32_to_float16 GeneratedTests.cast_float32_to_float16_relaxed +GeneratedTests.cast_float32_to_int32_nnfw GeneratedTests.cast_int32_to_float16 -GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 GeneratedTests.concat_dynamic_nnfw GeneratedTests.conv_dynamic_nnfw @@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7 GeneratedTests.gather_float16_8 GeneratedTests.greater_dynamic_float_nnfw GeneratedTests.greater_equal_dynamic_float_nnfw +GeneratedTests.l2_normalization_quant8_nnfw GeneratedTests.less_dynamic_float_nnfw GeneratedTests.less_equal_dynamic_float_nnfw GeneratedTests.log_4D_float_nnfw @@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw GeneratedTests.one_hot_ex_dynamic_nnfw GeneratedTests.pack_ex_dynamic_nnfw 
GeneratedTests.pad_dynamic_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 GeneratedTests.pow_2D_float_nnfw GeneratedTests.pow_broadcast_float_nnfw GeneratedTests.pow_broadcast_float_nnfw_2 GeneratedTests.pow_broadcast_float_nnfw_3 GeneratedTests.pow_dynamic_nnfw +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.range_ex_float_1 GeneratedTests.range_ex_float_1_all_constant_inputs GeneratedTests.range_ex_float_1_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon index c9edee5..d987bf1 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon @@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8 GeneratedTests.cast_float16_to_quant8_overflow GeneratedTests.cast_float32_to_float16 GeneratedTests.cast_float32_to_float16_relaxed -GeneratedTests.cast_float32_to_quant8_overflow -GeneratedTests.cast_float32_to_quant8_overflow_relaxed GeneratedTests.cast_int32_to_float16 -GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 GeneratedTests.concat_dynamic_nnfw GeneratedTests.conv_dynamic_nnfw @@ -73,6 +70,7 @@ GeneratedTests.gather_float16_8 GeneratedTests.greater_dynamic_float_nnfw GeneratedTests.greater_equal_boolean GeneratedTests.greater_equal_dynamic_float_nnfw +GeneratedTests.l2_normalization_quant8_nnfw GeneratedTests.less_boolean GeneratedTests.less_dynamic_float_nnfw GeneratedTests.less_equal_dynamic_float_nnfw @@ -112,11 +110,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw GeneratedTests.one_hot_ex_dynamic_nnfw GeneratedTests.pack_ex_dynamic_nnfw GeneratedTests.pad_dynamic_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 GeneratedTests.pow_2D_float_nnfw GeneratedTests.pow_broadcast_float_nnfw GeneratedTests.pow_broadcast_float_nnfw_2 GeneratedTests.pow_broadcast_float_nnfw_3 GeneratedTests.pow_dynamic_nnfw +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.range_ex_float_1 GeneratedTests.range_ex_float_1_all_constant_inputs GeneratedTests.range_ex_float_1_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu index 3cce4f3..bc0ae0f 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu @@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 GeneratedTests.hashtable_lookup_float GeneratedTests.hashtable_lookup_float_4D_nnfw GeneratedTests.hashtable_lookup_quant8 -GeneratedTests.l2_normalization -GeneratedTests.l2_normalization_2 -GeneratedTests.l2_normalization_large GeneratedTests.l2_pool_float GeneratedTests.l2_pool_float_2 
GeneratedTests.l2_pool_float_large @@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 GeneratedTests.neg GeneratedTests.neg_3D_int_nnfw GeneratedTests.neg_4D_int_nnfw -GeneratedTests.pad_quant8_nnfw GeneratedTests.prelu GeneratedTests.prelu_broadcast_float_1_nnfw GeneratedTests.prelu_broadcast_quant8_1_nnfw @@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 GeneratedTests.prelu_weight_as_input_quant8_2 GeneratedTests.prelu_weight_as_input_quant8_3 GeneratedTests.prelu_weight_as_input_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.reduce_max_quant8 GeneratedTests.reduce_max_quant8_1_nnfw GeneratedTests.reduce_max_quant8_2 @@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 GeneratedTests.select_v1_2_two_dim_quant8 GeneratedTests.slice_5 GeneratedTests.slice_6 -GeneratedTests.slice_7 GeneratedTests.slice_8 GeneratedTests.slice_zero_sized GeneratedTests.slice_zero_sized_quant8 -GeneratedTests.space_to_depth_float_1 -GeneratedTests.space_to_depth_float_2 -GeneratedTests.space_to_depth_float_3 GeneratedTests.space_to_depth_quant8_1 GeneratedTests.space_to_depth_quant8_2 GeneratedTests.sqrt_ diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl index e50b941..005f61c 100644 --- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl +++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl @@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8 GeneratedTests.cast_float16_to_quant8_overflow GeneratedTests.cast_float32_to_float16 GeneratedTests.cast_float32_to_float16_relaxed +GeneratedTests.cast_float32_to_int32_nnfw GeneratedTests.cast_int32_to_float16 -GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 GeneratedTests.concat_dynamic_nnfw GeneratedTests.conv_dynamic_nnfw @@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7 GeneratedTests.gather_float16_8 GeneratedTests.greater_dynamic_float_nnfw GeneratedTests.greater_equal_dynamic_float_nnfw +GeneratedTests.l2_normalization_quant8_nnfw GeneratedTests.less_dynamic_float_nnfw GeneratedTests.less_equal_dynamic_float_nnfw GeneratedTests.log_4D_float_nnfw @@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw GeneratedTests.one_hot_ex_dynamic_nnfw GeneratedTests.pack_ex_dynamic_nnfw GeneratedTests.pad_dynamic_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 GeneratedTests.pow_2D_float_nnfw GeneratedTests.pow_broadcast_float_nnfw GeneratedTests.pow_broadcast_float_nnfw_2 GeneratedTests.pow_broadcast_float_nnfw_3 GeneratedTests.pow_dynamic_nnfw +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.range_ex_float_1 GeneratedTests.range_ex_float_1_all_constant_inputs GeneratedTests.range_ex_float_1_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon index 55cfe39..051fbc7 100644 --- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon @@ 
-23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8 GeneratedTests.cast_float16_to_quant8_overflow GeneratedTests.cast_float32_to_float16 GeneratedTests.cast_float32_to_float16_relaxed -GeneratedTests.cast_float32_to_quant8_overflow -GeneratedTests.cast_float32_to_quant8_overflow_relaxed GeneratedTests.cast_int32_to_float16 -GeneratedTests.cast_int32_to_quant8_overflow GeneratedTests.cast_quant8_to_float16 GeneratedTests.concat_dynamic_nnfw GeneratedTests.conv_dynamic_nnfw @@ -73,6 +70,7 @@ GeneratedTests.greater_dynamic_float_nnfw GeneratedTests.greater_equal_boolean GeneratedTests.greater_equal_dynamic_float_nnfw GeneratedTests.less_boolean +GeneratedTests.l2_normalization_quant8_nnfw GeneratedTests.less_dynamic_float_nnfw GeneratedTests.less_equal_dynamic_float_nnfw GeneratedTests.log_4D_float_nnfw @@ -111,11 +109,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw GeneratedTests.one_hot_ex_dynamic_nnfw GeneratedTests.pack_ex_dynamic_nnfw GeneratedTests.pad_dynamic_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 GeneratedTests.pow_2D_float_nnfw GeneratedTests.pow_broadcast_float_nnfw GeneratedTests.pow_broadcast_float_nnfw_2 GeneratedTests.pow_broadcast_float_nnfw_3 GeneratedTests.pow_dynamic_nnfw +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.range_ex_float_1 GeneratedTests.range_ex_float_1_all_constant_inputs GeneratedTests.range_ex_float_1_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu index 3cce4f3..bc0ae0f 100644 --- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu +++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu @@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 GeneratedTests.hashtable_lookup_float GeneratedTests.hashtable_lookup_float_4D_nnfw GeneratedTests.hashtable_lookup_quant8 -GeneratedTests.l2_normalization -GeneratedTests.l2_normalization_2 -GeneratedTests.l2_normalization_large GeneratedTests.l2_pool_float GeneratedTests.l2_pool_float_2 GeneratedTests.l2_pool_float_large @@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 GeneratedTests.neg GeneratedTests.neg_3D_int_nnfw GeneratedTests.neg_4D_int_nnfw -GeneratedTests.pad_quant8_nnfw GeneratedTests.prelu GeneratedTests.prelu_broadcast_float_1_nnfw GeneratedTests.prelu_broadcast_quant8_1_nnfw @@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 GeneratedTests.prelu_weight_as_input_quant8_2 GeneratedTests.prelu_weight_as_input_quant8_3 GeneratedTests.prelu_weight_as_input_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.reduce_max_quant8 GeneratedTests.reduce_max_quant8_1_nnfw GeneratedTests.reduce_max_quant8_2 @@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 GeneratedTests.select_v1_2_two_dim_quant8 GeneratedTests.slice_5 GeneratedTests.slice_6 -GeneratedTests.slice_7 GeneratedTests.slice_8 GeneratedTests.slice_zero_sized GeneratedTests.slice_zero_sized_quant8 -GeneratedTests.space_to_depth_float_1 -GeneratedTests.space_to_depth_float_2 
-GeneratedTests.space_to_depth_float_3 GeneratedTests.space_to_depth_quant8_1 GeneratedTests.space_to_depth_quant8_2 GeneratedTests.sqrt_ diff --git a/tests/nnapi/nnapi_gtest.skip.noarch.interp b/tests/nnapi/nnapi_gtest.skip.noarch.interp index 08118ca..069d367 100644 --- a/tests/nnapi/nnapi_gtest.skip.noarch.interp +++ b/tests/nnapi/nnapi_gtest.skip.noarch.interp @@ -188,6 +188,7 @@ GeneratedTests.hashtable_lookup_quant8 GeneratedTests.l2_normalization GeneratedTests.l2_normalization_2 GeneratedTests.l2_normalization_large +GeneratedTests.l2_normalization_quant8_nnfw GeneratedTests.l2_pool_float GeneratedTests.l2_pool_float_2 GeneratedTests.l2_pool_float_large @@ -312,6 +313,12 @@ GeneratedTests.pack_ex_2D_int_2 GeneratedTests.pack_ex_dynamic_nnfw GeneratedTests.pad_dynamic_nnfw GeneratedTests.pad_quant8_nnfw +GeneratedTests.pad_v2_1_float +GeneratedTests.pad_v2_1_quant8 +GeneratedTests.pad_v2_all_dims +GeneratedTests.pad_v2_all_dims_quant8 +GeneratedTests.pad_v2_low_rank +GeneratedTests.pad_v2_low_rank_quant8 GeneratedTests.pow_2D_float_nnfw GeneratedTests.pow_broadcast_float_nnfw GeneratedTests.pow_broadcast_float_nnfw_2 @@ -331,6 +338,15 @@ GeneratedTests.prelu_weight_as_input_quant8 GeneratedTests.prelu_weight_as_input_quant8_2 GeneratedTests.prelu_weight_as_input_quant8_3 GeneratedTests.prelu_weight_as_input_quant8_4 +GeneratedTests.quantize_quant8 +GeneratedTests.quantize_quant8_2 +GeneratedTests.quantize_quant8_3 +GeneratedTests.quantize_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.range_ex_float_1 GeneratedTests.range_ex_float_1_all_constant_inputs GeneratedTests.range_ex_float_1_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu index 3cce4f3..bc0ae0f 100644 --- a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu +++ b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu @@ -38,9 +38,6 @@ GeneratedTests.gather_float16_8 GeneratedTests.hashtable_lookup_float GeneratedTests.hashtable_lookup_float_4D_nnfw GeneratedTests.hashtable_lookup_quant8 -GeneratedTests.l2_normalization -GeneratedTests.l2_normalization_2 -GeneratedTests.l2_normalization_large GeneratedTests.l2_pool_float GeneratedTests.l2_pool_float_2 GeneratedTests.l2_pool_float_large @@ -79,7 +76,6 @@ GeneratedTests.minimum_simple_quant8 GeneratedTests.neg GeneratedTests.neg_3D_int_nnfw GeneratedTests.neg_4D_int_nnfw -GeneratedTests.pad_quant8_nnfw GeneratedTests.prelu GeneratedTests.prelu_broadcast_float_1_nnfw GeneratedTests.prelu_broadcast_quant8_1_nnfw @@ -94,6 +90,11 @@ GeneratedTests.prelu_weight_as_input_quant8 GeneratedTests.prelu_weight_as_input_quant8_2 GeneratedTests.prelu_weight_as_input_quant8_3 GeneratedTests.prelu_weight_as_input_quant8_4 +GeneratedTests.quantize_quant8_5 +GeneratedTests.quantize_quant8_6 +GeneratedTests.quantize_quant8_7 +GeneratedTests.quantize_quant8_8 +GeneratedTests.quantize_zero_sized GeneratedTests.reduce_max_quant8 GeneratedTests.reduce_max_quant8_1_nnfw GeneratedTests.reduce_max_quant8_2 @@ -125,13 +126,9 @@ GeneratedTests.select_v1_2_one_dim_quant8 GeneratedTests.select_v1_2_two_dim_quant8 GeneratedTests.slice_5 GeneratedTests.slice_6 -GeneratedTests.slice_7 GeneratedTests.slice_8 GeneratedTests.slice_zero_sized GeneratedTests.slice_zero_sized_quant8 -GeneratedTests.space_to_depth_float_1 -GeneratedTests.space_to_depth_float_2 -GeneratedTests.space_to_depth_float_3 
GeneratedTests.space_to_depth_quant8_1 GeneratedTests.space_to_depth_quant8_2 GeneratedTests.sqrt_ diff --git a/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py new file mode 100644 index 0000000..ca3770c --- /dev/null +++ b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py @@ -0,0 +1,30 @@ +# +# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved +# Copyright (C) 2017 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +model = Model() +in0 = Input("op1", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128") +out0 = Output("op2", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128") +model = model.Operation("L2_NORMALIZATION", in0).To(out0) + +# Example 1. Input in operand 0, +input0 = {in0: # input 0 + [0, 5, 12]} +output0 = {out0: # output 0 + [51, 54, 58]} + +# Instantiate an example +Example((input0, output0)) diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py rename to tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py diff --git a/tests/nnapi/specs/skip/V1_2/quantize.mod.py b/tests/nnapi/specs/V1_2/quantize.mod.py similarity index 100% rename from tests/nnapi/specs/skip/V1_2/quantize.mod.py rename to tests/nnapi/specs/V1_2/quantize.mod.py diff --git a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc index 67f2467..c6c6355 100644 --- a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc +++ b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc @@ -51,19 +51,24 @@ 
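On the new l2_normalization_quant8_nnfw spec above: the expected output [51, 54, 58] comes from dequantizing the input with scale 2e-7 and zero point 128, L2-normalizing the resulting vector, and requantizing with scale 1/128 and zero point 128, the fixed quant8 output parameters NNAPI defines for L2_NORMALIZATION (that fixed output scale is an assumption here; the spec text itself declares 2e-7 on the output operand). The input scale cancels during normalization, so only the offsets from the zero point matter. A short check of the arithmetic:

    #include <cmath>
    #include <cstdio>

    int main()
    {
      const int input[3] = {0, 5, 12}; // quant8 values from the spec
      const int zero_point = 128;

      // Offsets from the zero point; the input scale cancels in normalization.
      double v[3], sq_sum = 0.0;
      for (int i = 0; i < 3; ++i)
      {
        v[i] = input[i] - zero_point;
        sq_sum += v[i] * v[i];
      }
      const double norm = std::sqrt(sq_sum); // ~212.06

      // Requantize with scale 1/128, zero point 128 (assumed fixed output params).
      for (int i = 0; i < 3; ++i)
        std::printf("%ld ", std::lround(v[i] / norm * 128.0) + 128); // prints: 51 54 58
      std::printf("\n");
      return 0;
    }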
TEST_F(ValidationTestAddModelLoaded, output_tensorinfo) ASSERT_EQ(tensor_info.dims[0], 1); } -TEST_F(ValidationTestAddModelLoaded, neg_run_001) +TEST_F(ValidationTestAddModelLoaded, neg_run) { - ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); + // nnfw_prepare is not called + ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); } -TEST_F(ValidationTestAddModelLoaded, neg_set_input_001) +TEST_F(ValidationTestAddModelLoaded, neg_set_input) { - ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); + // nnfw_prepare is not called + ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), + NNFW_STATUS_INVALID_STATE); } -TEST_F(ValidationTestAddModelLoaded, neg_set_output_001) +TEST_F(ValidationTestAddModelLoaded, neg_set_output) { - ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); + // nnfw_prepare is not called + ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), + NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestAddModelLoaded, neg_get_input_size) @@ -81,7 +86,7 @@ TEST_F(ValidationTestAddModelLoaded, neg_load_model) // load model twice ASSERT_EQ(nnfw_load_model_from_file( _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), - NNFW_STATUS_ERROR); + NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestAddModelLoaded, neg_output_tensorinfo) diff --git a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc index 1bb4182..0f4a4af 100644 --- a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc +++ b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc @@ -102,7 +102,7 @@ TEST_F(ValidationTestAddSessionPrepared, neg_run_during_async_run) { SetInOutBuffers(); ASSERT_EQ(nnfw_run_async(_session), NNFW_STATUS_NO_ERROR); - EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); + EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_NO_ERROR); } @@ -152,13 +152,13 @@ TEST_F(ValidationTestAddSessionPrepared, neg_load_model) // Load model twice ASSERT_EQ(nnfw_load_model_from_file( _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()), - NNFW_STATUS_ERROR); + NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestAddSessionPrepared, neg_prepare) { // Call Prepare twice - ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); } // TODO Validation check when "nnfw_run" is called without input & output tensor setting diff --git a/tests/nnfw_api/src/ValidationTestSessionCreated.cc b/tests/nnfw_api/src/ValidationTestSessionCreated.cc index 2675aa7..01832db 100644 --- a/tests/nnfw_api/src/ValidationTestSessionCreated.cc +++ b/tests/nnfw_api/src/ValidationTestSessionCreated.cc @@ -58,7 +58,7 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_1) nnfw_load_model_from_file( _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_NO_MANIFEST).c_str()), NNFW_STATUS_ERROR); - ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2) @@ -67,52 +67,52 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2) _session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_INVALID_MANIFEST).c_str()), NNFW_STATUS_ERROR); - ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_prepare(_session), 
NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_prepare_001) { // nnfw_load_model_from_file was not called - ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_run_001) { // nnfw_load_model_from_file and nnfw_prepare was not called - ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_set_input_001) { - // Invalid state - ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), + NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_set_output_001) { - // Invalid state - ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), + NNFW_STATUS_INVALID_STATE); } TEST_F(ValidationTestSessionCreated, neg_get_input_size) { uint32_t size = 10000; - ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_ERROR); - ASSERT_EQ(size, 10000); + ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_INVALID_STATE); + ASSERT_EQ(size, 10000); // Remain unchanged } TEST_F(ValidationTestSessionCreated, neg_get_output_size) { uint32_t size = 10000; - ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_ERROR); - ASSERT_EQ(size, 10000); + ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_INVALID_STATE); + ASSERT_EQ(size, 10000); // Remain unchanged } TEST_F(ValidationTestSessionCreated, neg_output_tensorinfo) { nnfw_tensorinfo tensor_info; // model is not loaded - ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_INVALID_STATE); // model is not loaded and tensor_info is null - ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_ERROR); + ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_INVALID_STATE); } diff --git a/tests/scripts/benchmark_nnapi.sh b/tests/scripts/benchmark_nnapi.sh index c7f44c5..af79728 100755 --- a/tests/scripts/benchmark_nnapi.sh +++ b/tests/scripts/benchmark_nnapi.sh @@ -18,7 +18,6 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source $MY_PATH/common.sh -BENCHMARK_RUN_TEST_SH= BENCHMARK_DRIVER_BIN= BENCHMARK_REPORT_DIR= BENCHMARK_MODELS_FILE= @@ -30,7 +29,7 @@ EXECUTORS="Linear Parallel" #TODO: accept this list as argument function Usage() { - echo "Usage: ./$0 --reportdir=. --runtestsh=tests/scripts/framework/run_test.sh --driverbin=Product/out/bin/tflite_run" + echo "Usage: ./$0 --reportdir=. 
--driverbin=Product/out/bin/tflite_run" } for i in "$@" @@ -43,9 +42,6 @@ do --test_op) TEST_OP="true" ;; - --runtestsh=*) - BENCHMARK_RUN_TEST_SH=${i#*=} - ;; --driverbin=*) BENCHMARK_DRIVER_BIN=${i#*=} ;; @@ -147,9 +143,8 @@ function run_onert_with_all_config() local REPORT_MODEL_DIR=$2 local PAUSE_TIME_IN_SEC=$3 local BENCHMARK_DRIVER_BIN=$4 - local BENCHMARK_RUN_TEST_SH=$5 - local EXECUTORS=$6 - local BACKEND_LIST=$7 + local EXECUTORS=$5 + local BACKEND_LIST=$6 export USE_NNAPI=1 @@ -163,18 +158,18 @@ function run_onert_with_all_config() done export BACKENDS=$BACKENDS_TO_USE if [ "$TEST_OP" == "false" ]; then - profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT + profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT fi for executor in $EXECUTORS; do export EXECUTOR=$executor if [ "$TEST_OP" == "false" ]; then - run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $executor + run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $executor fi for backend in $BACKEND_LIST; do export OP_BACKEND_ALLOPS=$backend run_benchmark_and_print "tflite_onert_"$executor"_executor_$backend" "TFLite onert $executor Executor $backend"\ - $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH + $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN done done unset USE_NNAPI EXECUTOR OP_BACKEND_ALLOPS BACKENDS @@ -215,14 +210,14 @@ function run_benchmark_test() # TFLite+CPU unset USE_NNAPI - run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH + run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN # run onert if [ "$TEST_OP" == "true" ]; then # Operation test don't need to test each scheduler - run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "Linear" "$BACKEND_LIST" + run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "Linear" "$BACKEND_LIST" else - run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "$EXECUTORS" "$BACKEND_LIST" + run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "$EXECUTORS" "$BACKEND_LIST" fi if [[ $i -ne $(echo $BENCHMARK_MODEL_LIST | wc -w)-1 ]]; then diff --git a/tests/scripts/common.sh b/tests/scripts/common.sh index 8800290..b2799c2 100755 --- a/tests/scripts/common.sh +++ b/tests/scripts/common.sh @@ -18,13 +18,12 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" function get_result_of_benchmark_test() { - local RUN_TEST_SH=$1 - local DRIVER_BIN=$2 - local MODEL=$3 - local LOG_FILE=$4 + local DRIVER_BIN=$1 + local MODEL=$2 + local LOG_FILE=$3 local RET=0 - $RUN_TEST_SH --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1 + $MY_PATH/framework/run_test.sh --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1 RET=$? if [[ $RET -ne 0 ]]; then echo "Testing $MODEL aborted... 
exit code: $RET" @@ -68,7 +67,7 @@ function run_benchmark_and_print() LOG_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.txt RESULT_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.result print_with_dots $MSG - RESULT=$(get_result_of_benchmark_test $BENCHMARK_RUN_TEST_SH $DRIVER_BIN $MODEL $LOG_FILE) + RESULT=$(get_result_of_benchmark_test $DRIVER_BIN $MODEL $LOG_FILE) echo "$RESULT ms" print_result_of_benchmark_test "$MSG" "$RESULT" $RESULT_FILE sleep $PAUSE_TIME_IN_SEC diff --git a/tests/scripts/framework/run_test.sh b/tests/scripts/framework/run_test.sh index 44b7149..9440c52 100755 --- a/tests/scripts/framework/run_test.sh +++ b/tests/scripts/framework/run_test.sh @@ -28,10 +28,12 @@ function Usage() echo "Usage: ./$0 --driverbin={such as tflite_run} {tests to test or empty for all of tests}" echo "Usage: ./$0 --driverbin=Product/out/bin/tflite_run --reportdir=report --tapname=verification.tap avgpool1 avgpool2" echo "" - echo "--download - (default=off) Download model files. Other options is ignored" - echo "--driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests" - echo "--reportdir - (default=report) directory to place tap files" - echo "--tapname - (default=framework_test.tap) file name to be written for tap" + echo "--download - (default=on) Download model files" + echo "--run - (default=on) Test model files" + echo "--driverbin - (default=../../Product/out/bin/tflite_run) Runner for runnning model tests" + echo "--reportdir - (default=report) Directory to place tap files" + echo "--tapname - (default=framework_test.tap) File name to be written for tap" + echo "--md5 - (default=on) MD5 check when download model files" echo "" } @@ -43,9 +45,13 @@ function need_download() return 0; fi # Ignore checking md5 in cache + # TODO Use "--md5" option only and remove IGNORE_MD5 environment variable if [ ! -z $IGNORE_MD5 ] && [ "$IGNORE_MD5" == "1" ]; then return 1 fi + if [ "$MD5_CHECK" = "off" ]; then + return 1 + fi LOCAL_HASH=$(md5sum $LOCAL_PATH | awk '{ print $1 }') REMOTE_HASH=$(curl -ss $REMOTE_URL | md5sum | awk '{ print $1 }') @@ -60,7 +66,9 @@ function need_download() DRIVER_BIN="" TAP_NAME="framework_test.tap" TEST_LIST=() -DOWNLOAD_MODE="off" +DOWNLOAD_MODEL="on" +RUN_TEST="on" +MD5_CHECK="on" # Support environment variable setting for mirror server FIXED_MODELFILE_SERVER="${MODELFILE_SERVER:-}" @@ -84,6 +92,12 @@ do --download=*) DOWNLOAD_MODE=${i#*=} ;; + --md5=*) + MD5_CHECK=${i#*=} + ;; + --run=*) + RUN_TEST=${i#*=} + ;; *) TEST_LIST+=( $i ) ;; @@ -100,7 +114,7 @@ if [ ! -n "$DRIVER_BIN" ]; then fi # Check test driver setting -if [ ! -e $DRIVER_BIN ] && [ "$DOWNLOAD_MODE" != "on" ]; then +if [ ! -e $DRIVER_BIN ] && [ "$RUN_TEST" = "on" ]; then echo "Cannot find test driver" $DRIVER_BIN ": please set proper DRIVER_BIN" exit 1 fi @@ -139,33 +153,9 @@ run_tests() TEST_CACHE_PATH=$CACHE_ROOT_PATH/$TEST_NAME MODELFILE=$TEST_CACHE_PATH/$MODELFILE_NAME - MODELFILE_URL="$MODELFILE_SERVER_PATH/$MODELFILE_NAME" - if [ -n "$FIXED_MODELFILE_SERVER" ]; then - MODELFILE_URL="$FIXED_MODELFILE_SERVER/$MODELFILE_NAME" - fi - - # Download model file - if [ ! 
-e $TEST_CACHE_PATH ]; then - mkdir -p $TEST_CACHE_PATH - fi - - # Download unless we have it in cache (Also check md5sum) - if need_download "$MODELFILE" "$MODELFILE_URL"; then - echo "" - echo "Download test file for $TEST_NAME" - echo "======================" - - rm -f $MODELFILE # Remove invalid file if exists - pushd $TEST_CACHE_PATH - wget -nv $MODELFILE_URL - if [ "${MODELFILE_NAME##*.}" == "zip" ]; then - unzip -o $MODELFILE_NAME - fi - popd - fi # Find model file for downloaded by zip - if [ "${MODELFILE_NAME##*.}" == "zip" ]; then + if [ "${MODELFILE_NAME##*.}" = "zip" ]; then pushd $TEST_CACHE_PATH MODELFILE=$TEST_CACHE_PATH/$(ls *.tflite) popd @@ -178,7 +168,6 @@ run_tests() # Run driver to test framework $DRIVER_BIN $MODELFILE - #$DRIVER_BIN $MODELFILE if [[ $? -eq 0 ]]; then echo "ok $i - $TEST_NAME" >> $REPORT_DIR/$TAP_NAME else @@ -268,10 +257,11 @@ find_tests() mkdir -p $REPORT_DIR TESTS_TO_RUN=$(find_tests ${TEST_LIST[@]}) -if [[ "$DOWNLOAD_MODE" == "on" ]]; then +if [ "$DOWNLOAD_MODEL" = "on" ]; then download_tests $TESTS_TO_RUN - exit 0; fi -run_tests $TESTS_TO_RUN +if [ "$RUN_TEST" = "on" ]; then + run_tests $TESTS_TO_RUN +fi exit $? diff --git a/tests/scripts/test-driver.sh b/tests/scripts/test-driver.sh index 615fc2c..a720b15 100755 --- a/tests/scripts/test-driver.sh +++ b/tests/scripts/test-driver.sh @@ -38,7 +38,6 @@ function Usage() echo "etc." echo "--framework_driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests" echo "--verification_driverbin - (default=../../Product/out/bin/nnapi_test) runner for runnning verification tests" - echo "--runtestsh - (default=\$ARTIFACT_PATH/tests/scripts/framework/run_test.sh) run_test.sh with path where it is for framework test and verification" echo "--unittestdir - (default=\$ARTIFACT_PATH/Product/out/unittest) directory that has unittest binaries for unit test" echo "" echo "--reportdir - (default=\$ARTIFACT_PATH/report) directory to save report" @@ -49,7 +48,6 @@ TEST_DRIVER_DIR="$( cd "$( dirname "${BASH_SOURCE}" )" && pwd )" ARTIFACT_PATH="$TEST_DRIVER_DIR/../../" FRAMEWORK_DRIVER_BIN="" VERIFICATION_DRIVER_BIN="" -RUN_TEST_SH="" UNIT_TEST_DIR="" ALLTEST_ON="true" UNITTEST_ON="false" @@ -74,9 +72,6 @@ do --verification_driverbin=*) VERIFICATION_DRIVER_BIN=${i#*=} ;; - --runtestsh=*) - RUN_TEST_SH=${i#*=} - ;; --unittestdir=*) UNIT_TEST_DIR=${i#*=} ;; @@ -116,15 +111,6 @@ done ARTIFACT_PATH="$(readlink -f $ARTIFACT_PATH)" -if [ -z "$RUN_TEST_SH" ]; then - RUN_TEST_SH=$ARTIFACT_PATH/tests/scripts/framework/run_test.sh -fi - -if [ ! 
-e "$RUN_TEST_SH" ]; then - echo "Cannot find $RUN_TEST_SH" - exit 1 -fi - if [ -z "$UNIT_TEST_DIR" ]; then UNIT_TEST_DIR=$ARTIFACT_PATH/Product/out/unittest fi @@ -149,7 +135,6 @@ if [ "$FRAMEWORKTEST_ON" == "true" ]; then fi $TEST_DRIVER_DIR/test_framework.sh \ - --runtestsh=$RUN_TEST_SH \ --driverbin=$FRAMEWORK_DRIVER_BIN \ --reportdir=$REPORT_DIR \ --tapname=framework_test.tap \ @@ -166,7 +151,6 @@ if [ "$ALLTEST_ON" == "true" ] || [ "$VERIFICATION_ON" == "true" ]; then # verification uses the same script as frameworktest does $TEST_DRIVER_DIR/test_framework.sh \ - --runtestsh=$RUN_TEST_SH \ --driverbin=$VERIFICATION_DRIVER_BIN \ --reportdir=$REPORT_DIR \ --tapname=verification_test.tap \ @@ -180,7 +164,6 @@ if [ "$BENCHMARK_ONERT_OP_ON" == "true" ]; then $TEST_DRIVER_DIR/benchmark_nnapi.sh \ --test_op \ - --runtestsh=$RUN_TEST_SH \ --driverbin=$DRIVER_BIN \ --reportdir=$REPORT_DIR/benchmark_op \ --modelfilepath=$ARTIFACT_PATH/tests/scripts/framework diff --git a/tests/scripts/test_framework.sh b/tests/scripts/test_framework.sh index 1d97515..bd86cd3 100755 --- a/tests/scripts/test_framework.sh +++ b/tests/scripts/test_framework.sh @@ -14,7 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -FWTEST_RUN_TEST_SH= +MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + FWTEST_DRIVER_BIN= FWTEST_REPORT_DIR= FWTEST_TAP_NAME= @@ -25,7 +26,6 @@ function Usage() { echo "Usage Example:" echo "./$0 \\" - echo " --runtestsh=tests/scripts/framework/run_test.sh \\ # Test runner script path" echo " --driverbin=Product/out/bin/tflite_run \\ # Test driver path" echo " --frameworktest_list_file=tests/scripts/list/frameworktest_list.armv7l.cpu.txt \\" echo " --reportdir=report \\ # Directory for the report files will be saved" @@ -42,9 +42,6 @@ do -h|--help|help) Usage ;; - --runtestsh=*) - FWTEST_RUN_TEST_SH=${i#*=} - ;; --driverbin=*) FWTEST_DRIVER_BIN=${i#*=} ;; @@ -67,7 +64,6 @@ do shift done -[ ! -z "$FWTEST_RUN_TEST_SH" ] || Usage [ ! -z "$FWTEST_DRIVER_BIN" ] || Usage [ ! -z "$FWTEST_REPORT_DIR" ] || Usage [ ! -z "$FWTEST_TAP_NAME" ] || Usage @@ -86,7 +82,7 @@ if [ ! 
-z "$FRAMEWORKTEST_LIST_FILE" ]; then MODELLIST=$(cat "${FRAMEWORKTEST_LIST_FILE}") fi -$FWTEST_RUN_TEST_SH --driverbin=$FWTEST_DRIVER_BIN \ +$MY_PATH/framework/run_test.sh --driverbin=$FWTEST_DRIVER_BIN \ --reportdir=$FWTEST_REPORT_DIR \ --tapname=$FWTEST_TAP_NAME \ ${MODELLIST:-} \ diff --git a/tests/tools/nnpackage_run/CMakeLists.txt b/tests/tools/nnpackage_run/CMakeLists.txt index 0e333a0..ec45db4 100644 --- a/tests/tools/nnpackage_run/CMakeLists.txt +++ b/tests/tools/nnpackage_run/CMakeLists.txt @@ -33,7 +33,7 @@ target_include_directories(nnpackage_run PRIVATE src) target_include_directories(nnpackage_run PRIVATE ${Boost_INCLUDE_DIRS}) target_link_libraries(nnpackage_run onert_core onert tflite_loader) -target_link_libraries(nnpackage_run tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite jsoncpp) +target_link_libraries(nnpackage_run nnfw_lib_tflite jsoncpp) target_link_libraries(nnpackage_run nnfw-dev) target_link_libraries(nnpackage_run ${Boost_PROGRAM_OPTIONS_LIBRARY}) target_link_libraries(nnpackage_run nnfw_lib_benchmark) diff --git a/tests/tools/nnpackage_run/src/args.cc b/tests/tools/nnpackage_run/src/args.cc index 0dbcafc..cb4a7db 100644 --- a/tests/tools/nnpackage_run/src/args.cc +++ b/tests/tools/nnpackage_run/src/args.cc @@ -16,6 +16,7 @@ #include "args.h" +#include #include #include @@ -105,6 +106,75 @@ Args::Args(const int argc, char **argv) void Args::Initialize(void) { + auto process_nnpackage = [&](const std::string &package_filename) { + _package_filename = package_filename; + + std::cerr << "Package Filename " << _package_filename << std::endl; + if (_package_filename.empty()) + { + // TODO Print usage instead of the below message + std::cerr << "Please specify nnpackage file. Run with `--help` for usage." + << "\n"; + + exit(1); + } + else + { + if (access(_package_filename.c_str(), F_OK) == -1) + { + std::cerr << "nnpackage not found: " << _package_filename << "\n"; + } + } + }; + + auto process_output_sizes = [&](const std::string &output_sizes_json_str) { + Json::Value root; + Json::Reader reader; + if (!reader.parse(output_sizes_json_str, root, false)) + { + std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n"; + exit(1); + } + + auto arg_map = argArrayToMap(root); + for (auto &pair : arg_map) + { + uint32_t key = pair.first; + Json::Value &val_json = pair.second; + if (!val_json.isUInt()) + { + std::cerr << "All the values in `output_sizes` must be unsigned integers\n"; + exit(1); + } + uint32_t val = val_json.asUInt(); + _output_sizes[key] = val; + } + }; + + auto process_shape_prepare = [&](const std::string &shape_str) { + try + { + handleShapeParam(_shape_prepare, shape_str); + } + catch (const std::exception &e) + { + std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl; + exit(1); + } + }; + + auto process_shape_run = [&](const std::string &shape_str) { + try + { + handleShapeParam(_shape_run, shape_str); + } + catch (const std::exception &e) + { + std::cerr << "error with '--shape_run' option: " << shape_str << std::endl; + exit(1); + } + }; + // General options po::options_description general("General options", 100); @@ -112,32 +182,33 @@ void Args::Initialize(void) general.add_options() ("help,h", "Print available options") ("version", "Print version and exit immediately") - ("nnpackage", po::value()->required()) + ("nnpackage", po::value()->required()->notifier(process_nnpackage)) #if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1 - ("dump,d", po::value()->default_value(""), "Output 
filename") - ("load,l", po::value()->default_value(""), "Input filename") + ("dump,d", po::value()->default_value("")->notifier([&](const auto &v) { _dump_filename = v; }), "Output filename") + ("load,l", po::value()->default_value("")->notifier([&](const auto &v) { _load_filename = v; }), "Input filename") #endif - ("output_sizes", po::value(), + ("output_sizes", po::value()->notifier(process_output_sizes), "The output buffer size in JSON 1D array\n" "If not given, the model's output sizes are used\n" "e.g. '[0, 40, 2, 80]' to set 0th tensor to 40 and 2nd tensor to 80.\n") - ("num_runs,r", po::value()->default_value(1), "The number of runs") - ("warmup_runs,w", po::value()->default_value(0), "The number of warmup runs") - ("run_delay,t", po::value()->default_value(-1), "Delay time(ms) between runs (as default no delay") - ("gpumem_poll,g", po::value()->default_value(false), "Check gpu memory polling separately") - ("mem_poll,m", po::value()->default_value(false), "Check memory polling") - ("write_report,p", po::value()->default_value(false), + ("num_runs,r", po::value()->default_value(1)->notifier([&](const auto &v) { _num_runs = v; }), "The number of runs") + ("warmup_runs,w", po::value()->default_value(0)->notifier([&](const auto &v) { _warmup_runs = v; }), "The number of warmup runs") + ("run_delay,t", po::value()->default_value(-1)->notifier([&](const auto &v) { _run_delay = v; }), "Delay time(ms) between runs (as default no delay") + ("gpumem_poll,g", po::value()->default_value(false)->notifier([&](const auto &v) { _gpumem_poll = v; }), "Check gpu memory polling separately") + ("mem_poll,m", po::value()->default_value(false)->notifier([&](const auto &v) { _mem_poll = v; }), "Check memory polling") + ("write_report,p", po::value()->default_value(false)->notifier([&](const auto &v) { _write_report = v; }), "Write report\n" "{exec}-{nnpkg}-{backend}.csv will be generated.\n" "e.g. nnpackage_run-UNIT_Add_000-acl_cl.csv.\n" "{nnpkg} name may be changed to realpath if you use symbolic-link.") - ("shape_prepare", po::value()->default_value("[]"), + ("shape_prepare", po::value()->default_value("[]")->notifier(process_shape_prepare), "set shape of specified tensor before compilation\n" "e.g. '[0, [1, 2], 2, []]' to set 0th tensor to [1, 2] and 2nd tensor to [].\n") - ("shape_run", po::value()->default_value("[]"), + ("shape_run", po::value()->default_value("[]")->notifier(process_shape_run), "set shape of specified tensor right before running\n" "e.g. '[1, [1, 2]]` to set 1st tensor to [1, 2].\n") - ("verbose_level,v", po::value()->default_value(0), "Verbose level\n" + ("verbose_level,v", po::value()->default_value(0)->notifier([&](const auto &v) { _verbose_level = v; }), + "Verbose level\n" "0: prints the only result. Messages btw run don't print\n" "1: prints result and message btw run\n" "2: prints all of messages to print\n") @@ -180,158 +251,23 @@ void Args::Parse(const int argc, char **argv) return; } - po::notify(vm); try { -#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1 - if (vm.count("dump")) - { - _dump_filename = vm["dump"].as(); - } - - if (vm.count("load")) - { - _load_filename = vm["load"].as(); - } -#endif - - if (vm.count("nnpackage")) - { - _package_filename = vm["nnpackage"].as(); - - if (_package_filename.empty()) - { - // TODO Print usage instead of the below message - std::cerr << "Please specify nnpackage file. Run with `--help` for usage." 
- << "\n"; - - exit(1); - } - else - { - if (access(_package_filename.c_str(), F_OK) == -1) - { - std::cerr << "nnpackage not found: " << _package_filename << "\n"; - } - } - } - - if (vm.count("output_sizes")) - { - auto output_sizes_json_str = vm["output_sizes"].as(); - - Json::Value root; - Json::Reader reader; - if (!reader.parse(output_sizes_json_str, root, false)) - { - std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n"; - exit(1); - } - - auto arg_map = argArrayToMap(root); - for (auto &pair : arg_map) - { - uint32_t key = pair.first; - Json::Value &val_json = pair.second; - if (!val_json.isUInt()) - { - std::cerr << "All the values in `output_sizes` must be unsigned integers\n"; - exit(1); - } - uint32_t val = val_json.asUInt(); - _output_sizes[key] = val; - } - } - - if (vm.count("num_runs")) - { - _num_runs = vm["num_runs"].as(); - } - - if (vm.count("warmup_runs")) - { - _warmup_runs = vm["warmup_runs"].as(); - } - - if (vm.count("run_delay")) - { - _run_delay = vm["run_delay"].as(); - } - - if (vm.count("gpumem_poll")) - { - _gpumem_poll = vm["gpumem_poll"].as(); - } - - if (vm.count("mem_poll")) - { - _mem_poll = vm["mem_poll"].as(); - // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP - if (_mem_poll && _warmup_runs == 0) - { - _warmup_runs = 1; - } - } - - if (vm.count("write_report")) - { - _write_report = vm["write_report"].as(); - } - - if (vm.count("verbose_level")) - { - _verbose_level = vm["verbose_level"].as(); - } + po::notify(vm); } catch (const std::bad_cast &e) { - std::cerr << "error by bad cast" << e.what() << '\n'; + std::cerr << "Bad cast error - " << e.what() << '\n'; exit(1); } - if (vm.count("shape_prepare")) - { - std::string shape_str; - try - { - shape_str = vm["shape_prepare"].as(); - } - catch (const std::bad_cast &e) - { - std::cerr << "error by bad cast with '--shape_prepare' option" << e.what() << '\n'; - exit(1); - } - try - { - handleShapeParam(_shape_prepare, shape_str); - } - catch (const std::exception &e) - { - std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl; - exit(1); - } - } - - if (vm.count("shape_run")) + // This must be run after `notify` as `_warm_up_runs` must have been processed before. + if (vm.count("mem_poll")) { - std::string shape_str; - try - { - shape_str = vm["shape_run"].as(); - } - catch (const std::bad_cast &e) + // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP + if (_mem_poll && _warmup_runs == 0) { - std::cerr << "error by bad cast with '--shape_run' option" << e.what() << '\n'; - exit(1); - } - try - { - handleShapeParam(_shape_run, shape_str); - } - catch (const std::exception &e) - { - std::cerr << "error with '--shape_run' option: " << shape_str << std::endl; - exit(1); + _warmup_runs = 1; } } } diff --git a/tests/tools/nnpackage_run/src/h5formatter.cc b/tests/tools/nnpackage_run/src/h5formatter.cc index 34c075c..09ace47 100644 --- a/tests/tools/nnpackage_run/src/h5formatter.cc +++ b/tests/tools/nnpackage_run/src/h5formatter.cc @@ -145,6 +145,7 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector