From ec4ea6a482da92f5335fb325d47e245194dfbc72 Mon Sep 17 00:00:00 2001 From: =?utf8?q?=EC=9C=A4=ED=98=84=EC=8B=9D/=EB=8F=99=EC=9E=91=EC=A0=9C?= =?utf8?q?=EC=96=B4Lab=28SR=29/Principal=20Engineer/=EC=82=BC=EC=84=B1?= =?utf8?q?=EC=A0=84=EC=9E=90?= Date: Mon, 20 Aug 2018 13:43:04 +0900 Subject: [PATCH] [PureACL] Add NEON Operations (#2343) PureACL was modified to run CL or NEON operations. (Provide "NEON=1" as env variable to run NEON operations) Three NEON operations were tested: `Add`, `Sub`, `Mul` (8 generated tests were passed). Signed-off-by: Hyun Sik Yoon --- runtimes/pure_arm_compute/src/compilation.cc | 543 +++++++++++++++------ .../pure_arm_compute/src/internal/arm_compute.cc | 36 +- .../pure_arm_compute/src/internal/arm_compute.h | 28 +- .../src/internal/arm_compute/Cast.h | 2 +- .../src/internal/layers/FeatureLoggingLayer.h | 24 +- .../src/internal/layers/GenericReshapeLayer.cc | 35 +- .../src/internal/layers/GenericReshapeLayer.h | 20 +- .../src/internal/layers/SimpleArithmeticAddition.h | 35 +- .../src/internal/layers/SimpleCastLayer.h | 29 +- 9 files changed, 554 insertions(+), 198 deletions(-) diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc index d2be05f..7ab6ba9 100644 --- a/runtimes/pure_arm_compute/src/compilation.cc +++ b/runtimes/pure_arm_compute/src/compilation.cc @@ -1,7 +1,5 @@ #include -#include - #include #include #include @@ -24,6 +22,13 @@ #include #include +#include +#include +#include +#include +#include + +#include "internal/arm_compute.h" #include "internal/arm_compute/Cast.h" #include "internal/arm_compute/matrix/View.h" #include "internal/arm_compute/kernel/View.h" @@ -150,7 +155,7 @@ struct IAllocationContext { virtual ~IAllocationContext() = default; - virtual ::arm_compute::ICLTensor *at(const ::internal::tflite::operand::Index &ind) const = 0; + virtual ::arm_compute::ITensor *at(const ::internal::tflite::operand::Index &ind) const = 0; }; #include "internal/IExecutionBuilder.h" @@ -271,67 +276,105 @@ public: } private: - void appendReLU(::arm_compute::ICLTensor *tensor); - void appendReLU6(::arm_compute::ICLTensor *tensor); - void appendReLU1(::arm_compute::ICLTensor *tensor); - void appendTanh(::arm_compute::ICLTensor *tensor); + void appendReLU(::arm_compute::ITensor *tensor); + void appendReLU6(::arm_compute::ITensor *tensor); + void appendReLU1(::arm_compute::ITensor *tensor); + void appendTanh(::arm_compute::ITensor *tensor); public: - void append(FuseCode code, ::arm_compute::ICLTensor *tensor); + void append(FuseCode code, ::arm_compute::ITensor *tensor); private: IExecutionBuilder &_builder; }; -void ActivationBuilder::appendReLU(::arm_compute::ICLTensor *ifm_alloc) +void ActivationBuilder::appendReLU(::arm_compute::ITensor *ifm_alloc) { const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, nullptr, act_info); + fn->configure(CAST_CL(ifm_alloc), nullptr, act_info); - _builder.append("ReLU", std::move(fn)); + _builder.append("ReLU", std::move(fn)); + } + else + { + auto fn = nnfw::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + _builder.append("ReLU", std::move(fn)); + } } -void ActivationBuilder::appendReLU1(::arm_compute::ICLTensor *ifm_alloc) +void 
ActivationBuilder::appendReLU1(::arm_compute::ITensor *ifm_alloc) { const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, nullptr, act_info); + fn->configure(CAST_CL(ifm_alloc), nullptr, act_info); - _builder.append("ReLU1", std::move(fn)); + _builder.append("ReLU1", std::move(fn)); + } + else + { + auto fn = nnfw::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + _builder.append("ReLU1", std::move(fn)); + } } -void ActivationBuilder::appendReLU6(::arm_compute::ICLTensor *ifm_alloc) +void ActivationBuilder::appendReLU6(::arm_compute::ITensor *ifm_alloc) { const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(CAST_CL(ifm_alloc), nullptr, act_info); + + _builder.append("ReLU6", std::move(fn)); + } + else + { + auto fn = nnfw::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(ifm_alloc, nullptr, act_info); + fn->configure(ifm_alloc, nullptr, act_info); - _builder.append("ReLU6", std::move(fn)); + _builder.append("ReLU6", std::move(fn)); + } } -void ActivationBuilder::appendTanh(::arm_compute::ICLTensor *ifm_alloc) +void ActivationBuilder::appendTanh(::arm_compute::ITensor *ifm_alloc) { const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, nullptr, act_info); + fn->configure(CAST_CL(ifm_alloc), nullptr, act_info); - _builder.append("Tanh", std::move(fn)); + _builder.append("Tanh", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); } -void ActivationBuilder::append(FuseCode code, ::arm_compute::ICLTensor *ifm_alloc) +void ActivationBuilder::append(FuseCode code, ::arm_compute::ITensor *ifm_alloc) { switch (code) { @@ -463,18 +506,31 @@ void Planner::visit(const ::internal::tflite::op::Add::Node &node) auto l = nnfw::make_unique(); - l->configure(lhs_alloc, rhs_alloc, ofm_alloc); + l->configure(CAST_CL(lhs_alloc), CAST_CL(rhs_alloc), CAST_CL(ofm_alloc)); fn = std::move(l); } else { - auto l = nnfw::make_unique<::arm_compute::CLArithmeticAddition>(); + if (::internal::arm_compute::isGpuMode()) + { + auto l = nnfw::make_unique<::arm_compute::CLArithmeticAddition>(); - // TODO Decide ConvertPolicy (WARP? SATURATE?) according to NN API specification - l->configure(lhs_alloc, rhs_alloc, ofm_alloc, ::arm_compute::ConvertPolicy::SATURATE); + // TODO Decide ConvertPolicy (WARP? SATURATE?) according to NN API specification + l->configure(CAST_CL(lhs_alloc), CAST_CL(rhs_alloc), CAST_CL(ofm_alloc), + ::arm_compute::ConvertPolicy::SATURATE); - fn = std::move(l); + fn = std::move(l); + } + else // NEON + { + auto l = nnfw::make_unique<::arm_compute::NEArithmeticAddition>(); + + // TODO Decide ConvertPolicy (WARP? SATURATE?) 
according to NN API specification + l->configure(lhs_alloc, rhs_alloc, ofm_alloc, ::arm_compute::ConvertPolicy::SATURATE); + + fn = std::move(l); + } } builder.append("Add", std::move(fn)); @@ -523,14 +579,27 @@ void Planner::visit(const ::internal::tflite::op::Sub::Node &node) auto lhs_alloc = ctx.at(::internal::tflite::operand::Index{param.lhs_index}); auto rhs_alloc = ctx.at(::internal::tflite::operand::Index{param.rhs_index}); - auto fn = nnfw::make_unique<::arm_compute::CLArithmeticSubtraction>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLArithmeticSubtraction>(); + + // TODO Decide ConvertPolicy (WARP? SATURATE?) according to NN API specification + fn->configure(CAST_CL(lhs_alloc), CAST_CL(rhs_alloc), CAST_CL(ofm_alloc), + ::arm_compute::ConvertPolicy::SATURATE); + + builder.append("Sub", std::move(fn)); + } + else // NEON + { + auto fn = nnfw::make_unique<::arm_compute::NEArithmeticSubtraction>(); - // TODO Decide ConvertPolicy (WARP? SATURATE?) according to NN API specification - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc, ::arm_compute::ConvertPolicy::SATURATE); + // TODO Decide ConvertPolicy (WARP? SATURATE?) according to NN API specification + fn->configure(lhs_alloc, rhs_alloc, ofm_alloc, ::arm_compute::ConvertPolicy::SATURATE); - builder.append("Sub", std::move(fn)); + builder.append("Sub", std::move(fn)); + } - ActivationBuilder{builder}.append(param.activation, ofm_alloc); + ActivationBuilder{builder}.append(param.activation, CAST_CL(ofm_alloc)); }; _builder.addStage(stage); @@ -577,14 +646,27 @@ void Planner::visit(const ::internal::tflite::op::Mul::Node &node) auto lhs_input_alloc = ctx.at(::internal::tflite::operand::Index{param.lhs_index}); auto rhs_input_alloc = ctx.at(::internal::tflite::operand::Index{param.rhs_index}); - auto fn = nnfw::make_unique<::arm_compute::CLPixelWiseMultiplication>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLPixelWiseMultiplication>(); + + fn->configure(CAST_CL(lhs_input_alloc), CAST_CL(rhs_input_alloc), CAST_CL(output_alloc), + 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - fn->configure(lhs_input_alloc, rhs_input_alloc, output_alloc, - 1.0, // scale - arm_compute::ConvertPolicy::SATURATE, - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + builder.append("Mul", std::move(fn)); + } + else // NEON + { + auto fn = nnfw::make_unique<::arm_compute::NEPixelWiseMultiplication>(); - builder.append("Mul", std::move(fn)); + fn->configure(CAST_NE(lhs_input_alloc), CAST_NE(rhs_input_alloc), CAST_NE(output_alloc), + 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO); + + builder.append("Mul", std::move(fn)); + } ActivationBuilder{builder}.append(param.activation, output_alloc); }; @@ -654,13 +736,18 @@ void Planner::visit(const ::internal::tflite::op::Div::Node &node) auto lhs_alloc = ctx.at(::internal::tflite::operand::Index{param.lhs_index}); auto rhs_alloc = ctx.at(::internal::tflite::operand::Index{param.rhs_index}); - auto fn = nnfw::make_unique<::arm_compute::CLPixelWiseDivision>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLPixelWiseDivision>(); - // TODO Decide scale, overflow_policy, and rounding_policy. - // Currently, the default values are used. - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + // TODO Decide scale, overflow_policy, and rounding_policy. 
+ // Currently, the default values are used. + fn->configure(CAST_CL(lhs_alloc), CAST_CL(rhs_alloc), CAST_CL(ofm_alloc)); - builder.append("Div", std::move(fn)); + builder.append("Div", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -802,11 +889,17 @@ void Planner::visit(const ::internal::tflite::op::Conv2D::Implicit::Node &node) const auto conv_info = asPadStringInfo(param.padding, param.stride); - std::unique_ptr<::arm_compute::CLConvolutionLayer> fn{new ::arm_compute::CLConvolutionLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLConvolutionLayer> fn{new ::arm_compute::CLConvolutionLayer}; - fn->configure(ifm_alloc, ker_alloc, bias_alloc, ofm_alloc, conv_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ker_alloc), CAST_CL(bias_alloc), CAST_CL(ofm_alloc), + conv_info); - builder.append("Conv2D", std::move(fn)); + builder.append("Conv2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -957,11 +1050,17 @@ void Planner::visit(const ::internal::tflite::op::Conv2D::Explicit::Node &node) const auto conv_info = asPadStringInfo(param.padding, param.stride); - std::unique_ptr<::arm_compute::CLConvolutionLayer> fn{new ::arm_compute::CLConvolutionLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLConvolutionLayer> fn{new ::arm_compute::CLConvolutionLayer}; - fn->configure(ifm_alloc, ker_alloc, bias_alloc, ofm_alloc, conv_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ker_alloc), CAST_CL(bias_alloc), CAST_CL(ofm_alloc), + conv_info); - builder.append("Conv2D", std::move(fn)); + builder.append("Conv2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1082,11 +1181,17 @@ void Planner::visit(const ::internal::tflite::op::DepthwiseConv2D::Implicit::Nod const auto conv_info = asPadStringInfo(param.padding, param.stride); - auto fn = nnfw::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc, ker_alloc, bias_alloc, ofm_alloc, conv_info, param.multipler); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ker_alloc), CAST_CL(bias_alloc), CAST_CL(ofm_alloc), + conv_info, param.multipler); - builder.append("DepthwiseConv2D", std::move(fn)); + builder.append("DepthwiseConv2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1212,11 +1317,17 @@ void Planner::visit(const ::internal::tflite::op::DepthwiseConv2D::Explicit::Nod const auto conv_info = asPadStringInfo(param.padding, param.stride); - auto fn = nnfw::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc, ker_alloc, bias_alloc, ofm_alloc, conv_info, param.multipler); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ker_alloc), CAST_CL(bias_alloc), CAST_CL(ofm_alloc), + conv_info, param.multipler); - builder.append("DepthwiseConv2D", std::move(fn)); + builder.append("DepthwiseConv2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); 
ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1271,13 +1382,17 @@ void Planner::visit(const ::internal::tflite::op::Dequantize::Node &node) l->configure(input_alloc, output_alloc); fn = std::move(l); } - else + else // Use the OpenCL version of CAST operation { - // Use the OpenCL version of CAST operation - auto l = nnfw::make_unique<::arm_compute::CLCast>(); + if (::internal::arm_compute::isGpuMode()) + { + auto l = nnfw::make_unique<::arm_compute::CLCast>(); - l->configure(input_alloc, output_alloc); - fn = std::move(l); + l->configure(CAST_CL(input_alloc), CAST_CL(output_alloc)); + fn = std::move(l); + } + else + throw std::runtime_error("Not supported, yet"); } builder.append("Dequantize", std::move(fn)); @@ -1372,11 +1487,16 @@ void Planner::visit(const ::internal::tflite::op::MaxPool2D::Implicit::Node &nod ::arm_compute::Size2D{param.kw, param.kh}, asPadStringInfo(param.padding, param.stride)}; - std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; - fn->configure(ifm_alloc, ofm_alloc, info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), info); - builder.append("MaxPool2D", std::move(fn)); + builder.append("MaxPool2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1476,11 +1596,16 @@ void Planner::visit(const ::internal::tflite::op::MaxPool2D::Explicit::Node &nod ::arm_compute::Size2D{param.kw, param.kh}, asPadStringInfo(param.padding, param.stride)}; - std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; - fn->configure(ifm_alloc, ofm_alloc, info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), info); - builder.append("MaxPool2D", std::move(fn)); + builder.append("MaxPool2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1576,11 +1701,16 @@ void Planner::visit(const ::internal::tflite::op::AvgPool2D::Implicit::Node &nod ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{param.kw, param.kh}, asPadStringInfo(param.padding, param.stride), true /* exclude_padding */}; - std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; - fn->configure(ifm_alloc, ofm_alloc, info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), info); - builder.append("AvgPool2D", std::move(fn)); + builder.append("AvgPool2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1680,11 +1810,16 @@ void Planner::visit(const ::internal::tflite::op::AvgPool2D::Explicit::Node &nod ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{param.kw, param.kh}, asPadStringInfo(param.padding, param.stride), true /* exclude_padding */}; - std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; - fn->configure(ifm_alloc, 
ofm_alloc, info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), info); - builder.append("AvgPool2D", std::move(fn)); + builder.append("AvgPool2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1941,11 +2076,17 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node) auto weight_alloc = ctx.at(::internal::tflite::operand::Index{param.weight_index}); auto bias_alloc = ctx.at(::internal::tflite::operand::Index{param.bias_index}); - auto fn = nnfw::make_unique<::arm_compute::CLFullyConnectedLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLFullyConnectedLayer>(); - fn->configure(input_alloc, weight_alloc, bias_alloc, output_alloc); + fn->configure(CAST_CL(input_alloc), CAST_CL(weight_alloc), CAST_CL(bias_alloc), + CAST_CL(output_alloc)); - builder.append("FullyConnected", std::move(fn)); + builder.append("FullyConnected", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, output_alloc); }; @@ -1989,13 +2130,19 @@ void Planner::visit(const ::internal::tflite::op::ResizeBilinear::Node &node) auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index}); auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index}); - auto fn = nnfw::make_unique<::arm_compute::CLScale>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLScale>(); - fn->configure(ifm_alloc, ofm_alloc, ::arm_compute::InterpolationPolicy::BILINEAR, - ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f), - ::arm_compute::SamplingPolicy::TOP_LEFT); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), + ::arm_compute::InterpolationPolicy::BILINEAR, + ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f), + ::arm_compute::SamplingPolicy::TOP_LEFT); - builder.append("ResizeBilinear", std::move(fn)); + builder.append("ResizeBilinear", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2034,12 +2181,17 @@ void Planner::visit(const ::internal::tflite::op::Reshape::Node &node) auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index}); auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index}); - // GenericReshape first apply NCHW->NHWC permutation, and apply reshape - auto fn = nnfw::make_unique(); + if (::internal::arm_compute::isGpuMode()) + { + // GenericReshape first apply NCHW->NHWC permutation, and apply reshape + auto fn = nnfw::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(CAST_CL(input_alloc), CAST_CL(output_alloc)); - builder.append("Reshape", std::move(fn)); + builder.append("Reshape", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2082,10 +2234,15 @@ void Planner::visit(const ::internal::tflite::op::Squeeze::Node &node) auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index}); auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index}); - auto fn = nnfw::make_unique<::arm_compute::CLReshapeLayer>(); - fn->configure(input_alloc, output_alloc); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLReshapeLayer>(); + fn->configure(CAST_CL(input_alloc), CAST_CL(output_alloc)); - 
builder.append("Squeeze", std::move(fn)); + builder.append("Squeeze", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2127,11 +2284,16 @@ void Planner::visit(const ::internal::tflite::op::Softmax::Node &node) auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index}); auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index}); - auto fn = nnfw::make_unique<::arm_compute::CLSoftmaxLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLSoftmaxLayer>(); - fn->configure(input_alloc, output_alloc, param.scale); + fn->configure(CAST_CL(input_alloc), CAST_CL(output_alloc), param.scale); - builder.append("Softmax", std::move(fn)); + builder.append("Softmax", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2228,12 +2390,18 @@ void Planner::visit(const ::internal::tflite::op::StridedSlice::Node &node) auto endData_alloc = ctx.at(::internal::tflite::operand::Index{param.endData_index}); auto stridesData_alloc = ctx.at(::internal::tflite::operand::Index{param.stridesData_index}); - auto fn = nnfw::make_unique<::arm_compute::CLStridedSlice>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLStridedSlice>(); - fn->configure(inputData_alloc, outputData_alloc, startData_alloc, endData_alloc, - stridesData_alloc, param.beginMask, param.endMask, param.shrinkAxisMask); + fn->configure(CAST_CL(inputData_alloc), CAST_CL(outputData_alloc), CAST_CL(startData_alloc), + CAST_CL(endData_alloc), CAST_CL(stridesData_alloc), param.beginMask, + param.endMask, param.shrinkAxisMask); - builder.append("StridedSlice", std::move(fn)); + builder.append("StridedSlice", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2288,11 +2456,16 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node) auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index}); auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index}); - auto fn = nnfw::make_unique<::arm_compute::CLReduceMax>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLReduceMax>(); - fn->configure(ifm_alloc, param.axis, ofm_alloc); + fn->configure(CAST_CL(ifm_alloc), param.axis, CAST_CL(ofm_alloc)); - builder.append("ReduceMax", std::move(fn)); + builder.append("ReduceMax", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2341,13 +2514,17 @@ void Planner::visit(const ::internal::tflite::op::Cast::Node &node) l->configure(input_alloc, output_alloc); fn = std::move(l); } - else + else // Use the OpenCL version of CAST operation { - // Use the OpenCL version of CAST operation - auto l = nnfw::make_unique<::arm_compute::CLCast>(); + if (::internal::arm_compute::isGpuMode()) + { + auto l = nnfw::make_unique<::arm_compute::CLCast>(); - l->configure(input_alloc, output_alloc); - fn = std::move(l); + l->configure(CAST_CL(input_alloc), CAST_CL(output_alloc)); + fn = std::move(l); + } + else + throw std::runtime_error("Not supported, yet"); } builder.append("Cast", std::move(fn)); @@ -2403,11 +2580,16 @@ void Planner::visit(const ::internal::tflite::op::TopKV2::Node &node) auto indices_alloc = ctx.at(::internal::tflite::operand::Index{param.outputIndices_index}); auto input_alloc = 
ctx.at(::internal::tflite::operand::Index{param.inputData_index}); - auto fn = nnfw::make_unique<::arm_compute::CLTopKV2>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLTopKV2>(); - fn->configure(input_alloc, param.k, values_alloc, indices_alloc); + fn->configure(CAST_CL(input_alloc), param.k, CAST_CL(values_alloc), CAST_CL(indices_alloc)); - builder.append("TopKV2", std::move(fn)); + builder.append("TopKV2", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2458,12 +2640,17 @@ void Planner::visit(const ::internal::tflite::op::Gather::Node &node) auto lhs_alloc = ctx.at(::internal::tflite::operand::Index{param.lhs_index}); auto rhs_alloc = ctx.at(::internal::tflite::operand::Index{param.rhs_index}); - std::unique_ptr<::arm_compute::IFunction> fn; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::IFunction> fn; - auto l = nnfw::make_unique<::arm_compute::CLGather>(); - l->configure(lhs_alloc, rhs_alloc, ofm_alloc); - fn = std::move(l); - builder.append("Gather", std::move(fn)); + auto l = nnfw::make_unique<::arm_compute::CLGather>(); + l->configure(CAST_CL(lhs_alloc), CAST_CL(rhs_alloc), CAST_CL(ofm_alloc)); + fn = std::move(l); + builder.append("Gather", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2502,11 +2689,16 @@ void Planner::visit(const ::internal::tflite::op::ReLU::Node &node) const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, ofm_alloc, act_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), act_info); - builder.append("ReLU", std::move(fn)); + builder.append("ReLU", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2545,11 +2737,16 @@ void Planner::visit(const ::internal::tflite::op::ReLU1::Node &node) const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, ofm_alloc, act_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), act_info); - builder.append("ReLU1", std::move(fn)); + builder.append("ReLU1", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2588,11 +2785,16 @@ void Planner::visit(const ::internal::tflite::op::ReLU6::Node &node) const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, ofm_alloc, act_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), act_info); - builder.append("ReLU6", std::move(fn)); + builder.append("ReLU6", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2629,11 +2831,16 @@ void Planner::visit(const 
::internal::tflite::op::Tanh::Node &node) const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, ofm_alloc, act_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), act_info); - builder.append("Tanh", std::move(fn)); + builder.append("Tanh", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2672,11 +2879,16 @@ void Planner::visit(const ::internal::tflite::op::Logistic::Node &node) const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, ofm_alloc, act_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), act_info); - builder.append("Logistic", std::move(fn)); + builder.append("Logistic", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2759,11 +2971,16 @@ void Planner::visit(const ::internal::tflite::op::Mean::Node &node) auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index}); auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index}); - auto fn = nnfw::make_unique<::arm_compute::CLReductionMean>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLReductionMean>(); - fn->configure(ifm_alloc, ofm_alloc, param.axis); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), param.axis); - builder.append("Mean", std::move(fn)); + builder.append("Mean", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2778,7 +2995,7 @@ public: } public: - ::arm_compute::ICLTensor *at(const ::internal::tflite::operand::Index &ind) const override + ::arm_compute::ITensor *at(const ::internal::tflite::operand::Index &ind) const override { return _plan.operands().at(ind).ptr(); } @@ -2949,10 +3166,10 @@ void PlanBuilder::addStage(const Stage &stage) { _stages.emplace_back(stage); } void PlanBuilder::finalize(void) const { - // CLTensor objects to be initialized later - std::vector> tensors; + // ITensor objects to be initialized later + std::vector> tensors; - // Create CLTensor & CLSubTensor + // Create Tensor & CLSubTensor auto isAllocated = [this](int ind) { const ::internal::tflite::operand::Index operand_index{ind}; return _plan.operands().exist(operand_index); @@ -2975,8 +3192,31 @@ void PlanBuilder::finalize(void) const assert(base_tensor != nullptr); - auto curr_tensor = std::make_shared<::arm_compute::CLSubTensor>(base_tensor, sub_info.shape(), - sub_info.offset()); + auto curr_tensor = std::make_shared<::arm_compute::CLSubTensor>( + CAST_CL(base_tensor), sub_info.shape(), sub_info.offset()); + + _plan.operands().set(::internal::tflite::operand::Index{curr}, curr_tensor); + }; + + auto setNETensor = [&](int ind) { + auto tensor = std::make_shared<::arm_compute::Tensor>(); + + tensor->allocator()->init(_tensor_info_ctx.at(ind)); + + // NOTE Do NOT allocate here. 
allocate should be invoked after configure functions + _plan.operands().set(::internal::tflite::operand::Index{ind}, tensor); + tensors.emplace_back(tensor); + }; + + auto setNESubTensor = [&](int curr) { + const auto &sub_info = *(_subsumption_ctx.find(curr)->second); + + auto base_tensor = _plan.operands().at(sub_info.base()).ptr(); + + assert(base_tensor != nullptr); + + auto curr_tensor = std::make_shared<::arm_compute::SubTensor>(base_tensor, sub_info.shape(), + sub_info.offset()); _plan.operands().set(::internal::tflite::operand::Index{curr}, curr_tensor); }; @@ -3002,7 +3242,10 @@ void PlanBuilder::finalize(void) const if (it_s == _subsumption_ctx.end()) { - setCLTensor(curr); + if (::internal::arm_compute::isGpuMode()) + setCLTensor(curr); + else + setNETensor(curr); stack.pop(); continue; } @@ -3011,7 +3254,10 @@ void PlanBuilder::finalize(void) const if (isAllocated(sub_info.base().asInt())) { - setCLSubTensor(curr); + if (::internal::arm_compute::isGpuMode()) + setCLSubTensor(curr); + else + setNESubTensor(curr); stack.pop(); } else @@ -3030,7 +3276,10 @@ void PlanBuilder::finalize(void) const continue; } - setCLTensor(it->first); + if (::internal::arm_compute::isGpuMode()) + setCLTensor(it->first); + else + setNETensor(it->first); } // Process Stage @@ -3045,7 +3294,16 @@ void PlanBuilder::finalize(void) const // Allocate Tensor Memory for (const auto &tensor : tensors) { - tensor->allocator()->allocate(); + if (::internal::arm_compute::isGpuMode()) + { + auto cl_tensor = CAST_CL(tensor.get()); + cl_tensor->allocator()->allocate(); + } + else + { + auto ne_tensor = CAST_NE(tensor.get()); + ne_tensor->allocator()->allocate(); + } } // Fill weight/bias @@ -3259,7 +3517,8 @@ int ANeuralNetworksCompilation_finish(ANeuralNetworksCompilation *compilation) return ANEURALNETWORKS_UNEXPECTED_NULL; } - arm_compute::CLScheduler::get().default_init(); + if (::internal::arm_compute::isGpuMode()) + arm_compute::CLScheduler::get().default_init(); const auto &operands = compilation->plan().model().operands(); const auto &operations = compilation->plan().model().operations(); diff --git a/runtimes/pure_arm_compute/src/internal/arm_compute.cc b/runtimes/pure_arm_compute/src/internal/arm_compute.cc index 394a64c..689510c 100644 --- a/runtimes/pure_arm_compute/src/internal/arm_compute.cc +++ b/runtimes/pure_arm_compute/src/internal/arm_compute.cc @@ -13,11 +13,19 @@ namespace operand void Object::access(const std::function &fn) const { - auto &queue = ::arm_compute::CLScheduler::get().queue(); + if (::internal::arm_compute::isGpuMode()) + { + auto &queue = ::arm_compute::CLScheduler::get().queue(); - _tensor->map(queue); - fn(*_tensor); - _tensor->unmap(queue); + auto cl_tensor = _tensor.get(); + CAST_CL(cl_tensor)->map(queue); + fn(*_tensor); + CAST_CL(cl_tensor)->unmap(queue); + } + else + { + fn(*_tensor); + } } } // namespace operand @@ -32,7 +40,7 @@ namespace operand { Context &Context::set(const ::internal::tflite::operand::Index &id, - const std::shared_ptr<::arm_compute::ICLTensor> &tensor) + const std::shared_ptr<::arm_compute::ITensor> &tensor) { assert(_objects.find(id.asInt()) == _objects.end()); @@ -43,3 +51,21 @@ Context &Context::set(const ::internal::tflite::operand::Index &id, } // namespace operand } // namepsace arm_compute } // namespace internal + +namespace internal +{ +namespace arm_compute +{ + +bool isGpuMode() +{ + char *neon = std::getenv("NEON"); + if (neon == nullptr) + return true; + else if (neon[0] == '1') + return false; + return true; +} + +} // namepsace 
arm_compute +} // namespace internal diff --git a/runtimes/pure_arm_compute/src/internal/arm_compute.h b/runtimes/pure_arm_compute/src/internal/arm_compute.h index 8310faf..cacdfce 100644 --- a/runtimes/pure_arm_compute/src/internal/arm_compute.h +++ b/runtimes/pure_arm_compute/src/internal/arm_compute.h @@ -1,7 +1,9 @@ #ifndef __INTERNAL_ARM_COMPUTE_H__ #define __INTERNAL_ARM_COMPUTE_H__ -#include +#include +#include +#include namespace internal { @@ -16,16 +18,16 @@ public: Object() = default; public: - Object(const std::shared_ptr<::arm_compute::ICLTensor> &tensor) : _tensor{tensor} + Object(const std::shared_ptr<::arm_compute::ITensor> &tensor) : _tensor{tensor} { // DO NOTHING } public: - ::arm_compute::ICLTensor *ptr(void) const { return _tensor.get(); } + ::arm_compute::ITensor *ptr(void) const { return _tensor.get(); } private: - std::shared_ptr<::arm_compute::ICLTensor> _tensor; + std::shared_ptr<::arm_compute::ITensor> _tensor; public: void access(const std::function &fn) const; @@ -50,7 +52,7 @@ class Context { public: Context &set(const ::internal::tflite::operand::Index &ind, - const std::shared_ptr<::arm_compute::ICLTensor> &tensor); + const std::shared_ptr<::arm_compute::ITensor> &tensor); public: bool exist(const ::internal::tflite::operand::Index &ind) const @@ -172,4 +174,20 @@ private: } // namepsace arm_compute } // namespace internal +#include + +namespace internal +{ +namespace arm_compute +{ + +// check if this runtime runs on GPU or NEON +bool isGpuMode(); + +#define CAST_CL(tensor) static_cast<::arm_compute::CLTensor *>(tensor) +#define CAST_NE(tensor) static_cast<::arm_compute::Tensor *>(tensor) + +} // namepsace arm_compute +} // namespace internal + #endif // __INTERNAL_ARM_COMPUTE_H__ diff --git a/runtimes/pure_arm_compute/src/internal/arm_compute/Cast.h b/runtimes/pure_arm_compute/src/internal/arm_compute/Cast.h index 4add3d7..486c0af 100644 --- a/runtimes/pure_arm_compute/src/internal/arm_compute/Cast.h +++ b/runtimes/pure_arm_compute/src/internal/arm_compute/Cast.h @@ -235,7 +235,7 @@ inline ::arm_compute::TensorShape asTensorShape(const internal::tflite::operand: } template -void copyCast(const FromT value, ::arm_compute::ICLTensor *to, const ::arm_compute::Coordinates &id) +void copyCast(const FromT value, ::arm_compute::ITensor *to, const ::arm_compute::Coordinates &id) { switch (to->info()->data_type()) { diff --git a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h index 470279f..f0dc95d 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h @@ -1,16 +1,20 @@ #ifndef __FEATURE_LOGGING_LAYER_H__ #define __FEATURE_LOGGING_LAYER_H__ -#include +#include +#include +#include #include #include #include +#include "internal/arm_compute.h" + class FeatureLoggingLayer : public ::arm_compute::IFunction { public: - void configure(const std::string &tag, ::arm_compute::ICLTensor *target) + void configure(const std::string &tag, ::arm_compute::ITensor *target) { _tag = tag; _target = target; @@ -19,9 +23,11 @@ public: public: void run(void) override { - auto &q = ::arm_compute::CLScheduler::get().queue(); - - _target->map(q); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_target)->map(q); + } const size_t W = _target->info()->dimension(0); const size_t H = _target->info()->dimension(1); @@ -51,12 +57,16 @@ public: 
std::cout << std::endl; } - _target->unmap(q); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_target)->unmap(q); + } } private: std::string _tag; - ::arm_compute::ICLTensor *_target; + ::arm_compute::ITensor *_target; }; #endif // __FEATURE_LOGGING_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc index b3e4488..ac50dff 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc @@ -1,7 +1,7 @@ #include "GenericReshapeLayer.h" +#include "internal/arm_compute.h" -void GenericReshapeLayer::configure(::arm_compute::ICLTensor *input, - ::arm_compute::ICLTensor *output) +void GenericReshapeLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output) { _input = input; _output = output; @@ -17,15 +17,34 @@ void GenericReshapeLayer::configure(::arm_compute::ICLTensor *input, // const ::arm_compute::PermutationVector pv{2, 0, 1}; - _permute.configure(input, &_permuted, pv); - _reshape.configure(&_permuted, output); + if (::internal::arm_compute::isGpuMode()) + { + _cl_permute.configure(CAST_CL(input), &_cl_permuted, pv); + _cl_reshape.configure(&_cl_permuted, CAST_CL(output)); - // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. - _permuted.allocator()->allocate(); + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_permuted.allocator()->allocate(); + } + else + { + _neon_permute.configure(CAST_NE(input), &_neon_permuted, pv); + _neon_reshape.configure(&_neon_permuted, CAST_NE(output)); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. 
+ _neon_permuted.allocator()->allocate(); + } } void GenericReshapeLayer::run(void) { - _permute.run(); - _reshape.run(); + if (::internal::arm_compute::isGpuMode()) + { + _cl_permute.run(); + _cl_reshape.run(); + } + else + { + _neon_permute.run(); + _neon_reshape.run(); + } } diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h index ea6c950..c002f07 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h @@ -1,26 +1,34 @@ #ifndef __GENERIC_RESHAPE_LAYER_H__ #define __GENERIC_RESHAPE_LAYER_H__ +#include #include + #include #include +#include +#include class GenericReshapeLayer : public ::arm_compute::IFunction { public: - void configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output); + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output); public: void run(void) override; private: - ::arm_compute::ICLTensor *_input; - ::arm_compute::CLTensor _permuted; - ::arm_compute::ICLTensor *_output; + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + ::arm_compute::CLTensor _cl_permuted; + ::arm_compute::Tensor _neon_permuted; private: - ::arm_compute::CLPermute _permute; - ::arm_compute::CLReshapeLayer _reshape; + ::arm_compute::CLPermute _cl_permute; + ::arm_compute::CLReshapeLayer _cl_reshape; + + ::arm_compute::NEPermute _neon_permute; + ::arm_compute::NEReshapeLayer _neon_reshape; }; #endif // __GENERIC_RESHAPE_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h index 2bae649..f6bcfb5 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h @@ -1,13 +1,14 @@ #ifndef __SIMPLE_ARITHMETIC_ADDITION_H__ #define __SIMPLE_ARITHMETIC_ADDITION_H__ -#include +#include "internal/arm_compute.h" +#include class SimpleArithmeticAddition : public ::arm_compute::IFunction { public: - void configure(::arm_compute::ICLTensor *lhs, ::arm_compute::ICLTensor *rhs, - ::arm_compute::ICLTensor *out) + void configure(::arm_compute::ITensor *lhs, ::arm_compute::ITensor *rhs, + ::arm_compute::ITensor *out) { _lhs = lhs; _rhs = rhs; @@ -17,11 +18,14 @@ public: public: void run(void) override { - auto &q = ::arm_compute::CLScheduler::get().queue(); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); - _lhs->map(q); - _rhs->map(q); - _out->map(q); + CAST_CL(_lhs)->map(q); + CAST_CL(_rhs)->map(q); + CAST_CL(_out)->map(q); + } arm_compute::Window window; window.use_tensor_dimensions(_out->info()->tensor_shape()); @@ -69,15 +73,20 @@ public: } }); - _out->unmap(q); - _rhs->unmap(q); - _lhs->unmap(q); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_out)->unmap(q); + CAST_CL(_rhs)->unmap(q); + CAST_CL(_lhs)->unmap(q); + } } private: - ::arm_compute::ICLTensor *_lhs; - ::arm_compute::ICLTensor *_rhs; - ::arm_compute::ICLTensor *_out; + ::arm_compute::ITensor *_lhs; + ::arm_compute::ITensor *_rhs; + ::arm_compute::ITensor *_out; }; #endif // __SIMPLE_ARITHMETIC_ADDITION_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h index 
83f3030..5ea56ce 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h @@ -1,14 +1,15 @@ #ifndef __SIMPLE_CAST_LAYER_H__ #define __SIMPLE_CAST_LAYER_H__ -#include +#include +#include "internal/arm_compute.h" #include "internal/op/Cast.h" class SimpleCastLayer : public ::arm_compute::IFunction { public: - void configure(::arm_compute::ICLTensor *in, ::arm_compute::ICLTensor *out) + void configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out) { _in = in; _out = out; @@ -17,10 +18,12 @@ public: public: void run(void) override { - auto &q = ::arm_compute::CLScheduler::get().queue(); - - _in->map(q); - _out->map(q); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_in)->map(q); + CAST_CL(_out)->map(q); + } arm_compute::Window window; window.use_tensor_dimensions(_out->info()->tensor_shape()); @@ -28,11 +31,15 @@ public: execute_window_loop(window, [this](const arm_compute::Coordinates &id) { castData(_in, _out, id); }); - _out->unmap(q); - _in->unmap(q); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_out)->unmap(q); + CAST_CL(_in)->unmap(q); + } } - void castData(::arm_compute::ICLTensor *in, ::arm_compute::ICLTensor *out, + void castData(::arm_compute::ITensor *in, ::arm_compute::ITensor *out, const arm_compute::Coordinates &id) { switch (in->info()->data_type()) @@ -65,8 +72,8 @@ public: } private: - ::arm_compute::ICLTensor *_in; - ::arm_compute::ICLTensor *_out; + ::arm_compute::ITensor *_in; + ::arm_compute::ITensor *_out; }; #endif // __SIMPLE_CAST_LAYER_H__ -- 2.7.4
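
Note for reviewers: the backend-selection pattern this patch applies throughout compilation.cc and the internal layers can be summarized in isolation as below. This is a minimal illustrative sketch, not code from the patch: the helper name buildRelu, the use of std::make_unique in place of nnfw::make_unique, and the inlined static_casts (standing in for CAST_CL/CAST_NE) are assumptions for the example; it assumes the ARM Compute Library headers named here are available. It mirrors the isGpuMode() convention introduced above: OpenCL is the default backend, and exporting NEON=1 switches a stage to its NEON counterpart.

#include <cstdlib>
#include <memory>

#include <arm_compute/core/ITensor.h>
#include <arm_compute/runtime/CL/CLTensor.h>
#include <arm_compute/runtime/CL/CLFunctions.h>
#include <arm_compute/runtime/NEON/NEFunctions.h>

// Mirrors internal::arm_compute::isGpuMode(): OpenCL is the default backend;
// exporting NEON=1 selects the NEON backend instead.
static bool isGpuMode()
{
  const char *neon = std::getenv("NEON");
  return !(neon != nullptr && neon[0] == '1');
}

// The plan stores plain ITensor pointers; each stage downcasts to the
// backend-specific tensor type (as CAST_CL / CAST_NE do) before configure().
static std::unique_ptr<::arm_compute::IFunction>
buildRelu(::arm_compute::ITensor *ifm, ::arm_compute::ITensor *ofm)
{
  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};

  if (isGpuMode())
  {
    auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
    fn->configure(static_cast<::arm_compute::CLTensor *>(ifm),
                  static_cast<::arm_compute::CLTensor *>(ofm), act_info);
    return fn;
  }

  auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
  fn->configure(ifm, ofm, act_info);
  return fn;
}

A function built this way is appended to the plan exactly as the CL-only version was before and is executed later through IFunction::run(), so switching backends at run time needs no recompilation, only the NEON=1 environment variable.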