From ec4ea6a482da92f5335fb325d47e245194dfbc72 Mon Sep 17 00:00:00 2001 From: =?utf8?q?=EC=9C=A4=ED=98=84=EC=8B=9D/=EB=8F=99=EC=9E=91=EC=A0=9C?= =?utf8?q?=EC=96=B4Lab=28SR=29/Principal=20Engineer/=EC=82=BC=EC=84=B1?= =?utf8?q?=EC=A0=84=EC=9E=90?= Date: Mon, 20 Aug 2018 13:43:04 +0900 Subject: [PATCH] [PureACL] Add NEON Operations (#2343) PureACL was modified to run CL or NEON operations. (Provide "NEON=1" as env variable to run NEON operations) Three NEON operations were tested: `Add`, `Sub`, `Mul` (8 generated tests were passed). Signed-off-by: Hyun Sik Yoon --- runtimes/pure_arm_compute/src/compilation.cc | 543 +++++++++++++++------ .../pure_arm_compute/src/internal/arm_compute.cc | 36 +- .../pure_arm_compute/src/internal/arm_compute.h | 28 +- .../src/internal/arm_compute/Cast.h | 2 +- .../src/internal/layers/FeatureLoggingLayer.h | 24 +- .../src/internal/layers/GenericReshapeLayer.cc | 35 +- .../src/internal/layers/GenericReshapeLayer.h | 20 +- .../src/internal/layers/SimpleArithmeticAddition.h | 35 +- .../src/internal/layers/SimpleCastLayer.h | 29 +- 9 files changed, 554 insertions(+), 198 deletions(-) diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc index d2be05f..7ab6ba9 100644 --- a/runtimes/pure_arm_compute/src/compilation.cc +++ b/runtimes/pure_arm_compute/src/compilation.cc @@ -1,7 +1,5 @@ #include -#include - #include #include #include @@ -24,6 +22,13 @@ #include #include +#include +#include +#include +#include +#include + +#include "internal/arm_compute.h" #include "internal/arm_compute/Cast.h" #include "internal/arm_compute/matrix/View.h" #include "internal/arm_compute/kernel/View.h" @@ -150,7 +155,7 @@ struct IAllocationContext { virtual ~IAllocationContext() = default; - virtual ::arm_compute::ICLTensor *at(const ::internal::tflite::operand::Index &ind) const = 0; + virtual ::arm_compute::ITensor *at(const ::internal::tflite::operand::Index &ind) const = 0; }; #include "internal/IExecutionBuilder.h" @@ -271,67 +276,105 @@ public: } private: - void appendReLU(::arm_compute::ICLTensor *tensor); - void appendReLU6(::arm_compute::ICLTensor *tensor); - void appendReLU1(::arm_compute::ICLTensor *tensor); - void appendTanh(::arm_compute::ICLTensor *tensor); + void appendReLU(::arm_compute::ITensor *tensor); + void appendReLU6(::arm_compute::ITensor *tensor); + void appendReLU1(::arm_compute::ITensor *tensor); + void appendTanh(::arm_compute::ITensor *tensor); public: - void append(FuseCode code, ::arm_compute::ICLTensor *tensor); + void append(FuseCode code, ::arm_compute::ITensor *tensor); private: IExecutionBuilder &_builder; }; -void ActivationBuilder::appendReLU(::arm_compute::ICLTensor *ifm_alloc) +void ActivationBuilder::appendReLU(::arm_compute::ITensor *ifm_alloc) { const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, nullptr, act_info); + fn->configure(CAST_CL(ifm_alloc), nullptr, act_info); - _builder.append("ReLU", std::move(fn)); + _builder.append("ReLU", std::move(fn)); + } + else + { + auto fn = nnfw::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + _builder.append("ReLU", std::move(fn)); + } } -void ActivationBuilder::appendReLU1(::arm_compute::ICLTensor *ifm_alloc) +void 
ActivationBuilder::appendReLU1(::arm_compute::ITensor *ifm_alloc) { const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, nullptr, act_info); + fn->configure(CAST_CL(ifm_alloc), nullptr, act_info); - _builder.append("ReLU1", std::move(fn)); + _builder.append("ReLU1", std::move(fn)); + } + else + { + auto fn = nnfw::make_unique<::arm_compute::NEActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + _builder.append("ReLU1", std::move(fn)); + } } -void ActivationBuilder::appendReLU6(::arm_compute::ICLTensor *ifm_alloc) +void ActivationBuilder::appendReLU6(::arm_compute::ITensor *ifm_alloc) { const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(CAST_CL(ifm_alloc), nullptr, act_info); + + _builder.append("ReLU6", std::move(fn)); + } + else + { + auto fn = nnfw::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(ifm_alloc, nullptr, act_info); + fn->configure(ifm_alloc, nullptr, act_info); - _builder.append("ReLU6", std::move(fn)); + _builder.append("ReLU6", std::move(fn)); + } } -void ActivationBuilder::appendTanh(::arm_compute::ICLTensor *ifm_alloc) +void ActivationBuilder::appendTanh(::arm_compute::ITensor *ifm_alloc) { const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, nullptr, act_info); + fn->configure(CAST_CL(ifm_alloc), nullptr, act_info); - _builder.append("Tanh", std::move(fn)); + _builder.append("Tanh", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); } -void ActivationBuilder::append(FuseCode code, ::arm_compute::ICLTensor *ifm_alloc) +void ActivationBuilder::append(FuseCode code, ::arm_compute::ITensor *ifm_alloc) { switch (code) { @@ -463,18 +506,31 @@ void Planner::visit(const ::internal::tflite::op::Add::Node &node) auto l = nnfw::make_unique(); - l->configure(lhs_alloc, rhs_alloc, ofm_alloc); + l->configure(CAST_CL(lhs_alloc), CAST_CL(rhs_alloc), CAST_CL(ofm_alloc)); fn = std::move(l); } else { - auto l = nnfw::make_unique<::arm_compute::CLArithmeticAddition>(); + if (::internal::arm_compute::isGpuMode()) + { + auto l = nnfw::make_unique<::arm_compute::CLArithmeticAddition>(); - // TODO Decide ConvertPolicy (WARP? SATURATE?) according to NN API specification - l->configure(lhs_alloc, rhs_alloc, ofm_alloc, ::arm_compute::ConvertPolicy::SATURATE); + // TODO Decide ConvertPolicy (WARP? SATURATE?) according to NN API specification + l->configure(CAST_CL(lhs_alloc), CAST_CL(rhs_alloc), CAST_CL(ofm_alloc), + ::arm_compute::ConvertPolicy::SATURATE); - fn = std::move(l); + fn = std::move(l); + } + else // NEON + { + auto l = nnfw::make_unique<::arm_compute::NEArithmeticAddition>(); + + // TODO Decide ConvertPolicy (WARP? SATURATE?) 
according to NN API specification + l->configure(lhs_alloc, rhs_alloc, ofm_alloc, ::arm_compute::ConvertPolicy::SATURATE); + + fn = std::move(l); + } } builder.append("Add", std::move(fn)); @@ -523,14 +579,27 @@ void Planner::visit(const ::internal::tflite::op::Sub::Node &node) auto lhs_alloc = ctx.at(::internal::tflite::operand::Index{param.lhs_index}); auto rhs_alloc = ctx.at(::internal::tflite::operand::Index{param.rhs_index}); - auto fn = nnfw::make_unique<::arm_compute::CLArithmeticSubtraction>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLArithmeticSubtraction>(); + + // TODO Decide ConvertPolicy (WARP? SATURATE?) according to NN API specification + fn->configure(CAST_CL(lhs_alloc), CAST_CL(rhs_alloc), CAST_CL(ofm_alloc), + ::arm_compute::ConvertPolicy::SATURATE); + + builder.append("Sub", std::move(fn)); + } + else // NEON + { + auto fn = nnfw::make_unique<::arm_compute::NEArithmeticSubtraction>(); - // TODO Decide ConvertPolicy (WARP? SATURATE?) according to NN API specification - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc, ::arm_compute::ConvertPolicy::SATURATE); + // TODO Decide ConvertPolicy (WARP? SATURATE?) according to NN API specification + fn->configure(lhs_alloc, rhs_alloc, ofm_alloc, ::arm_compute::ConvertPolicy::SATURATE); - builder.append("Sub", std::move(fn)); + builder.append("Sub", std::move(fn)); + } - ActivationBuilder{builder}.append(param.activation, ofm_alloc); + ActivationBuilder{builder}.append(param.activation, CAST_CL(ofm_alloc)); }; _builder.addStage(stage); @@ -577,14 +646,27 @@ void Planner::visit(const ::internal::tflite::op::Mul::Node &node) auto lhs_input_alloc = ctx.at(::internal::tflite::operand::Index{param.lhs_index}); auto rhs_input_alloc = ctx.at(::internal::tflite::operand::Index{param.rhs_index}); - auto fn = nnfw::make_unique<::arm_compute::CLPixelWiseMultiplication>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLPixelWiseMultiplication>(); + + fn->configure(CAST_CL(lhs_input_alloc), CAST_CL(rhs_input_alloc), CAST_CL(output_alloc), + 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - fn->configure(lhs_input_alloc, rhs_input_alloc, output_alloc, - 1.0, // scale - arm_compute::ConvertPolicy::SATURATE, - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + builder.append("Mul", std::move(fn)); + } + else // NEON + { + auto fn = nnfw::make_unique<::arm_compute::NEPixelWiseMultiplication>(); - builder.append("Mul", std::move(fn)); + fn->configure(CAST_NE(lhs_input_alloc), CAST_NE(rhs_input_alloc), CAST_NE(output_alloc), + 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO); + + builder.append("Mul", std::move(fn)); + } ActivationBuilder{builder}.append(param.activation, output_alloc); }; @@ -654,13 +736,18 @@ void Planner::visit(const ::internal::tflite::op::Div::Node &node) auto lhs_alloc = ctx.at(::internal::tflite::operand::Index{param.lhs_index}); auto rhs_alloc = ctx.at(::internal::tflite::operand::Index{param.rhs_index}); - auto fn = nnfw::make_unique<::arm_compute::CLPixelWiseDivision>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLPixelWiseDivision>(); - // TODO Decide scale, overflow_policy, and rounding_policy. - // Currently, the default values are used. - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + // TODO Decide scale, overflow_policy, and rounding_policy. 
+ // Currently, the default values are used. + fn->configure(CAST_CL(lhs_alloc), CAST_CL(rhs_alloc), CAST_CL(ofm_alloc)); - builder.append("Div", std::move(fn)); + builder.append("Div", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -802,11 +889,17 @@ void Planner::visit(const ::internal::tflite::op::Conv2D::Implicit::Node &node) const auto conv_info = asPadStringInfo(param.padding, param.stride); - std::unique_ptr<::arm_compute::CLConvolutionLayer> fn{new ::arm_compute::CLConvolutionLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLConvolutionLayer> fn{new ::arm_compute::CLConvolutionLayer}; - fn->configure(ifm_alloc, ker_alloc, bias_alloc, ofm_alloc, conv_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ker_alloc), CAST_CL(bias_alloc), CAST_CL(ofm_alloc), + conv_info); - builder.append("Conv2D", std::move(fn)); + builder.append("Conv2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -957,11 +1050,17 @@ void Planner::visit(const ::internal::tflite::op::Conv2D::Explicit::Node &node) const auto conv_info = asPadStringInfo(param.padding, param.stride); - std::unique_ptr<::arm_compute::CLConvolutionLayer> fn{new ::arm_compute::CLConvolutionLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLConvolutionLayer> fn{new ::arm_compute::CLConvolutionLayer}; - fn->configure(ifm_alloc, ker_alloc, bias_alloc, ofm_alloc, conv_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ker_alloc), CAST_CL(bias_alloc), CAST_CL(ofm_alloc), + conv_info); - builder.append("Conv2D", std::move(fn)); + builder.append("Conv2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1082,11 +1181,17 @@ void Planner::visit(const ::internal::tflite::op::DepthwiseConv2D::Implicit::Nod const auto conv_info = asPadStringInfo(param.padding, param.stride); - auto fn = nnfw::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc, ker_alloc, bias_alloc, ofm_alloc, conv_info, param.multipler); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ker_alloc), CAST_CL(bias_alloc), CAST_CL(ofm_alloc), + conv_info, param.multipler); - builder.append("DepthwiseConv2D", std::move(fn)); + builder.append("DepthwiseConv2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1212,11 +1317,17 @@ void Planner::visit(const ::internal::tflite::op::DepthwiseConv2D::Explicit::Nod const auto conv_info = asPadStringInfo(param.padding, param.stride); - auto fn = nnfw::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc, ker_alloc, bias_alloc, ofm_alloc, conv_info, param.multipler); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ker_alloc), CAST_CL(bias_alloc), CAST_CL(ofm_alloc), + conv_info, param.multipler); - builder.append("DepthwiseConv2D", std::move(fn)); + builder.append("DepthwiseConv2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); 
ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1271,13 +1382,17 @@ void Planner::visit(const ::internal::tflite::op::Dequantize::Node &node) l->configure(input_alloc, output_alloc); fn = std::move(l); } - else + else // Use the OpenCL version of CAST operation { - // Use the OpenCL version of CAST operation - auto l = nnfw::make_unique<::arm_compute::CLCast>(); + if (::internal::arm_compute::isGpuMode()) + { + auto l = nnfw::make_unique<::arm_compute::CLCast>(); - l->configure(input_alloc, output_alloc); - fn = std::move(l); + l->configure(CAST_CL(input_alloc), CAST_CL(output_alloc)); + fn = std::move(l); + } + else + throw std::runtime_error("Not supported, yet"); } builder.append("Dequantize", std::move(fn)); @@ -1372,11 +1487,16 @@ void Planner::visit(const ::internal::tflite::op::MaxPool2D::Implicit::Node &nod ::arm_compute::Size2D{param.kw, param.kh}, asPadStringInfo(param.padding, param.stride)}; - std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; - fn->configure(ifm_alloc, ofm_alloc, info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), info); - builder.append("MaxPool2D", std::move(fn)); + builder.append("MaxPool2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1476,11 +1596,16 @@ void Planner::visit(const ::internal::tflite::op::MaxPool2D::Explicit::Node &nod ::arm_compute::Size2D{param.kw, param.kh}, asPadStringInfo(param.padding, param.stride)}; - std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; - fn->configure(ifm_alloc, ofm_alloc, info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), info); - builder.append("MaxPool2D", std::move(fn)); + builder.append("MaxPool2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1576,11 +1701,16 @@ void Planner::visit(const ::internal::tflite::op::AvgPool2D::Implicit::Node &nod ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{param.kw, param.kh}, asPadStringInfo(param.padding, param.stride), true /* exclude_padding */}; - std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; - fn->configure(ifm_alloc, ofm_alloc, info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), info); - builder.append("AvgPool2D", std::move(fn)); + builder.append("AvgPool2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1680,11 +1810,16 @@ void Planner::visit(const ::internal::tflite::op::AvgPool2D::Explicit::Node &nod ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{param.kw, param.kh}, asPadStringInfo(param.padding, param.stride), true /* exclude_padding */}; - std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::CLPoolingLayer> fn{new ::arm_compute::CLPoolingLayer}; - fn->configure(ifm_alloc, 
ofm_alloc, info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), info); - builder.append("AvgPool2D", std::move(fn)); + builder.append("AvgPool2D", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, ofm_alloc); }; @@ -1941,11 +2076,17 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node) auto weight_alloc = ctx.at(::internal::tflite::operand::Index{param.weight_index}); auto bias_alloc = ctx.at(::internal::tflite::operand::Index{param.bias_index}); - auto fn = nnfw::make_unique<::arm_compute::CLFullyConnectedLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLFullyConnectedLayer>(); - fn->configure(input_alloc, weight_alloc, bias_alloc, output_alloc); + fn->configure(CAST_CL(input_alloc), CAST_CL(weight_alloc), CAST_CL(bias_alloc), + CAST_CL(output_alloc)); - builder.append("FullyConnected", std::move(fn)); + builder.append("FullyConnected", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); ActivationBuilder{builder}.append(param.activation, output_alloc); }; @@ -1989,13 +2130,19 @@ void Planner::visit(const ::internal::tflite::op::ResizeBilinear::Node &node) auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index}); auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index}); - auto fn = nnfw::make_unique<::arm_compute::CLScale>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLScale>(); - fn->configure(ifm_alloc, ofm_alloc, ::arm_compute::InterpolationPolicy::BILINEAR, - ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f), - ::arm_compute::SamplingPolicy::TOP_LEFT); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), + ::arm_compute::InterpolationPolicy::BILINEAR, + ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f), + ::arm_compute::SamplingPolicy::TOP_LEFT); - builder.append("ResizeBilinear", std::move(fn)); + builder.append("ResizeBilinear", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2034,12 +2181,17 @@ void Planner::visit(const ::internal::tflite::op::Reshape::Node &node) auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index}); auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index}); - // GenericReshape first apply NCHW->NHWC permutation, and apply reshape - auto fn = nnfw::make_unique(); + if (::internal::arm_compute::isGpuMode()) + { + // GenericReshape first apply NCHW->NHWC permutation, and apply reshape + auto fn = nnfw::make_unique(); - fn->configure(input_alloc, output_alloc); + fn->configure(CAST_CL(input_alloc), CAST_CL(output_alloc)); - builder.append("Reshape", std::move(fn)); + builder.append("Reshape", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2082,10 +2234,15 @@ void Planner::visit(const ::internal::tflite::op::Squeeze::Node &node) auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index}); auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index}); - auto fn = nnfw::make_unique<::arm_compute::CLReshapeLayer>(); - fn->configure(input_alloc, output_alloc); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLReshapeLayer>(); + fn->configure(CAST_CL(input_alloc), CAST_CL(output_alloc)); - 
builder.append("Squeeze", std::move(fn)); + builder.append("Squeeze", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2127,11 +2284,16 @@ void Planner::visit(const ::internal::tflite::op::Softmax::Node &node) auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index}); auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index}); - auto fn = nnfw::make_unique<::arm_compute::CLSoftmaxLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLSoftmaxLayer>(); - fn->configure(input_alloc, output_alloc, param.scale); + fn->configure(CAST_CL(input_alloc), CAST_CL(output_alloc), param.scale); - builder.append("Softmax", std::move(fn)); + builder.append("Softmax", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2228,12 +2390,18 @@ void Planner::visit(const ::internal::tflite::op::StridedSlice::Node &node) auto endData_alloc = ctx.at(::internal::tflite::operand::Index{param.endData_index}); auto stridesData_alloc = ctx.at(::internal::tflite::operand::Index{param.stridesData_index}); - auto fn = nnfw::make_unique<::arm_compute::CLStridedSlice>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLStridedSlice>(); - fn->configure(inputData_alloc, outputData_alloc, startData_alloc, endData_alloc, - stridesData_alloc, param.beginMask, param.endMask, param.shrinkAxisMask); + fn->configure(CAST_CL(inputData_alloc), CAST_CL(outputData_alloc), CAST_CL(startData_alloc), + CAST_CL(endData_alloc), CAST_CL(stridesData_alloc), param.beginMask, + param.endMask, param.shrinkAxisMask); - builder.append("StridedSlice", std::move(fn)); + builder.append("StridedSlice", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2288,11 +2456,16 @@ void Planner::visit(const ::internal::tflite::op::ReduceMax::Node &node) auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index}); auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index}); - auto fn = nnfw::make_unique<::arm_compute::CLReduceMax>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLReduceMax>(); - fn->configure(ifm_alloc, param.axis, ofm_alloc); + fn->configure(CAST_CL(ifm_alloc), param.axis, CAST_CL(ofm_alloc)); - builder.append("ReduceMax", std::move(fn)); + builder.append("ReduceMax", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2341,13 +2514,17 @@ void Planner::visit(const ::internal::tflite::op::Cast::Node &node) l->configure(input_alloc, output_alloc); fn = std::move(l); } - else + else // Use the OpenCL version of CAST operation { - // Use the OpenCL version of CAST operation - auto l = nnfw::make_unique<::arm_compute::CLCast>(); + if (::internal::arm_compute::isGpuMode()) + { + auto l = nnfw::make_unique<::arm_compute::CLCast>(); - l->configure(input_alloc, output_alloc); - fn = std::move(l); + l->configure(CAST_CL(input_alloc), CAST_CL(output_alloc)); + fn = std::move(l); + } + else + throw std::runtime_error("Not supported, yet"); } builder.append("Cast", std::move(fn)); @@ -2403,11 +2580,16 @@ void Planner::visit(const ::internal::tflite::op::TopKV2::Node &node) auto indices_alloc = ctx.at(::internal::tflite::operand::Index{param.outputIndices_index}); auto input_alloc = 
ctx.at(::internal::tflite::operand::Index{param.inputData_index}); - auto fn = nnfw::make_unique<::arm_compute::CLTopKV2>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLTopKV2>(); - fn->configure(input_alloc, param.k, values_alloc, indices_alloc); + fn->configure(CAST_CL(input_alloc), param.k, CAST_CL(values_alloc), CAST_CL(indices_alloc)); - builder.append("TopKV2", std::move(fn)); + builder.append("TopKV2", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2458,12 +2640,17 @@ void Planner::visit(const ::internal::tflite::op::Gather::Node &node) auto lhs_alloc = ctx.at(::internal::tflite::operand::Index{param.lhs_index}); auto rhs_alloc = ctx.at(::internal::tflite::operand::Index{param.rhs_index}); - std::unique_ptr<::arm_compute::IFunction> fn; + if (::internal::arm_compute::isGpuMode()) + { + std::unique_ptr<::arm_compute::IFunction> fn; - auto l = nnfw::make_unique<::arm_compute::CLGather>(); - l->configure(lhs_alloc, rhs_alloc, ofm_alloc); - fn = std::move(l); - builder.append("Gather", std::move(fn)); + auto l = nnfw::make_unique<::arm_compute::CLGather>(); + l->configure(CAST_CL(lhs_alloc), CAST_CL(rhs_alloc), CAST_CL(ofm_alloc)); + fn = std::move(l); + builder.append("Gather", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2502,11 +2689,16 @@ void Planner::visit(const ::internal::tflite::op::ReLU::Node &node) const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, ofm_alloc, act_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), act_info); - builder.append("ReLU", std::move(fn)); + builder.append("ReLU", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2545,11 +2737,16 @@ void Planner::visit(const ::internal::tflite::op::ReLU1::Node &node) const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, ofm_alloc, act_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), act_info); - builder.append("ReLU1", std::move(fn)); + builder.append("ReLU1", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2588,11 +2785,16 @@ void Planner::visit(const ::internal::tflite::op::ReLU6::Node &node) const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, ofm_alloc, act_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), act_info); - builder.append("ReLU6", std::move(fn)); + builder.append("ReLU6", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2629,11 +2831,16 @@ void Planner::visit(const 
::internal::tflite::op::Tanh::Node &node) const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, ofm_alloc, act_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), act_info); - builder.append("Tanh", std::move(fn)); + builder.append("Tanh", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2672,11 +2879,16 @@ void Planner::visit(const ::internal::tflite::op::Logistic::Node &node) const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; - auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc, ofm_alloc, act_info); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), act_info); - builder.append("Logistic", std::move(fn)); + builder.append("Logistic", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2759,11 +2971,16 @@ void Planner::visit(const ::internal::tflite::op::Mean::Node &node) auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index}); auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index}); - auto fn = nnfw::make_unique<::arm_compute::CLReductionMean>(); + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique<::arm_compute::CLReductionMean>(); - fn->configure(ifm_alloc, ofm_alloc, param.axis); + fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), param.axis); - builder.append("Mean", std::move(fn)); + builder.append("Mean", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); }; _builder.addStage(stage); @@ -2778,7 +2995,7 @@ public: } public: - ::arm_compute::ICLTensor *at(const ::internal::tflite::operand::Index &ind) const override + ::arm_compute::ITensor *at(const ::internal::tflite::operand::Index &ind) const override { return _plan.operands().at(ind).ptr(); } @@ -2949,10 +3166,10 @@ void PlanBuilder::addStage(const Stage &stage) { _stages.emplace_back(stage); } void PlanBuilder::finalize(void) const { - // CLTensor objects to be initialized later - std::vector> tensors; + // ITensor objects to be initialized later + std::vector> tensors; - // Create CLTensor & CLSubTensor + // Create Tensor & CLSubTensor auto isAllocated = [this](int ind) { const ::internal::tflite::operand::Index operand_index{ind}; return _plan.operands().exist(operand_index); @@ -2975,8 +3192,31 @@ void PlanBuilder::finalize(void) const assert(base_tensor != nullptr); - auto curr_tensor = std::make_shared<::arm_compute::CLSubTensor>(base_tensor, sub_info.shape(), - sub_info.offset()); + auto curr_tensor = std::make_shared<::arm_compute::CLSubTensor>( + CAST_CL(base_tensor), sub_info.shape(), sub_info.offset()); + + _plan.operands().set(::internal::tflite::operand::Index{curr}, curr_tensor); + }; + + auto setNETensor = [&](int ind) { + auto tensor = std::make_shared<::arm_compute::Tensor>(); + + tensor->allocator()->init(_tensor_info_ctx.at(ind)); + + // NOTE Do NOT allocate here. 
allocate should be invoked after configure functions + _plan.operands().set(::internal::tflite::operand::Index{ind}, tensor); + tensors.emplace_back(tensor); + }; + + auto setNESubTensor = [&](int curr) { + const auto &sub_info = *(_subsumption_ctx.find(curr)->second); + + auto base_tensor = _plan.operands().at(sub_info.base()).ptr(); + + assert(base_tensor != nullptr); + + auto curr_tensor = std::make_shared<::arm_compute::SubTensor>(base_tensor, sub_info.shape(), + sub_info.offset()); _plan.operands().set(::internal::tflite::operand::Index{curr}, curr_tensor); }; @@ -3002,7 +3242,10 @@ void PlanBuilder::finalize(void) const if (it_s == _subsumption_ctx.end()) { - setCLTensor(curr); + if (::internal::arm_compute::isGpuMode()) + setCLTensor(curr); + else + setNETensor(curr); stack.pop(); continue; } @@ -3011,7 +3254,10 @@ void PlanBuilder::finalize(void) const if (isAllocated(sub_info.base().asInt())) { - setCLSubTensor(curr); + if (::internal::arm_compute::isGpuMode()) + setCLSubTensor(curr); + else + setNESubTensor(curr); stack.pop(); } else @@ -3030,7 +3276,10 @@ void PlanBuilder::finalize(void) const continue; } - setCLTensor(it->first); + if (::internal::arm_compute::isGpuMode()) + setCLTensor(it->first); + else + setNETensor(it->first); } // Process Stage @@ -3045,7 +3294,16 @@ void PlanBuilder::finalize(void) const // Allocate Tensor Memory for (const auto &tensor : tensors) { - tensor->allocator()->allocate(); + if (::internal::arm_compute::isGpuMode()) + { + auto cl_tensor = CAST_CL(tensor.get()); + cl_tensor->allocator()->allocate(); + } + else + { + auto ne_tensor = CAST_NE(tensor.get()); + ne_tensor->allocator()->allocate(); + } } // Fill weight/bias @@ -3259,7 +3517,8 @@ int ANeuralNetworksCompilation_finish(ANeuralNetworksCompilation *compilation) return ANEURALNETWORKS_UNEXPECTED_NULL; } - arm_compute::CLScheduler::get().default_init(); + if (::internal::arm_compute::isGpuMode()) + arm_compute::CLScheduler::get().default_init(); const auto &operands = compilation->plan().model().operands(); const auto &operations = compilation->plan().model().operations(); diff --git a/runtimes/pure_arm_compute/src/internal/arm_compute.cc b/runtimes/pure_arm_compute/src/internal/arm_compute.cc index 394a64c..689510c 100644 --- a/runtimes/pure_arm_compute/src/internal/arm_compute.cc +++ b/runtimes/pure_arm_compute/src/internal/arm_compute.cc @@ -13,11 +13,19 @@ namespace operand void Object::access(const std::function &fn) const { - auto &queue = ::arm_compute::CLScheduler::get().queue(); + if (::internal::arm_compute::isGpuMode()) + { + auto &queue = ::arm_compute::CLScheduler::get().queue(); - _tensor->map(queue); - fn(*_tensor); - _tensor->unmap(queue); + auto cl_tensor = _tensor.get(); + CAST_CL(cl_tensor)->map(queue); + fn(*_tensor); + CAST_CL(cl_tensor)->unmap(queue); + } + else + { + fn(*_tensor); + } } } // namespace operand @@ -32,7 +40,7 @@ namespace operand { Context &Context::set(const ::internal::tflite::operand::Index &id, - const std::shared_ptr<::arm_compute::ICLTensor> &tensor) + const std::shared_ptr<::arm_compute::ITensor> &tensor) { assert(_objects.find(id.asInt()) == _objects.end()); @@ -43,3 +51,21 @@ Context &Context::set(const ::internal::tflite::operand::Index &id, } // namespace operand } // namepsace arm_compute } // namespace internal + +namespace internal +{ +namespace arm_compute +{ + +bool isGpuMode() +{ + char *neon = std::getenv("NEON"); + if (neon == nullptr) + return true; + else if (neon[0] == '1') + return false; + return true; +} + +} // namepsace 
arm_compute +} // namespace internal diff --git a/runtimes/pure_arm_compute/src/internal/arm_compute.h b/runtimes/pure_arm_compute/src/internal/arm_compute.h index 8310faf..cacdfce 100644 --- a/runtimes/pure_arm_compute/src/internal/arm_compute.h +++ b/runtimes/pure_arm_compute/src/internal/arm_compute.h @@ -1,7 +1,9 @@ #ifndef __INTERNAL_ARM_COMPUTE_H__ #define __INTERNAL_ARM_COMPUTE_H__ -#include +#include +#include +#include namespace internal { @@ -16,16 +18,16 @@ public: Object() = default; public: - Object(const std::shared_ptr<::arm_compute::ICLTensor> &tensor) : _tensor{tensor} + Object(const std::shared_ptr<::arm_compute::ITensor> &tensor) : _tensor{tensor} { // DO NOTHING } public: - ::arm_compute::ICLTensor *ptr(void) const { return _tensor.get(); } + ::arm_compute::ITensor *ptr(void) const { return _tensor.get(); } private: - std::shared_ptr<::arm_compute::ICLTensor> _tensor; + std::shared_ptr<::arm_compute::ITensor> _tensor; public: void access(const std::function &fn) const; @@ -50,7 +52,7 @@ class Context { public: Context &set(const ::internal::tflite::operand::Index &ind, - const std::shared_ptr<::arm_compute::ICLTensor> &tensor); + const std::shared_ptr<::arm_compute::ITensor> &tensor); public: bool exist(const ::internal::tflite::operand::Index &ind) const @@ -172,4 +174,20 @@ private: } // namepsace arm_compute } // namespace internal +#include + +namespace internal +{ +namespace arm_compute +{ + +// check if this runtime runs on GPU or NEON +bool isGpuMode(); + +#define CAST_CL(tensor) static_cast<::arm_compute::CLTensor *>(tensor) +#define CAST_NE(tensor) static_cast<::arm_compute::Tensor *>(tensor) + +} // namepsace arm_compute +} // namespace internal + #endif // __INTERNAL_ARM_COMPUTE_H__ diff --git a/runtimes/pure_arm_compute/src/internal/arm_compute/Cast.h b/runtimes/pure_arm_compute/src/internal/arm_compute/Cast.h index 4add3d7..486c0af 100644 --- a/runtimes/pure_arm_compute/src/internal/arm_compute/Cast.h +++ b/runtimes/pure_arm_compute/src/internal/arm_compute/Cast.h @@ -235,7 +235,7 @@ inline ::arm_compute::TensorShape asTensorShape(const internal::tflite::operand: } template -void copyCast(const FromT value, ::arm_compute::ICLTensor *to, const ::arm_compute::Coordinates &id) +void copyCast(const FromT value, ::arm_compute::ITensor *to, const ::arm_compute::Coordinates &id) { switch (to->info()->data_type()) { diff --git a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h index 470279f..f0dc95d 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h @@ -1,16 +1,20 @@ #ifndef __FEATURE_LOGGING_LAYER_H__ #define __FEATURE_LOGGING_LAYER_H__ -#include +#include +#include +#include #include #include #include +#include "internal/arm_compute.h" + class FeatureLoggingLayer : public ::arm_compute::IFunction { public: - void configure(const std::string &tag, ::arm_compute::ICLTensor *target) + void configure(const std::string &tag, ::arm_compute::ITensor *target) { _tag = tag; _target = target; @@ -19,9 +23,11 @@ public: public: void run(void) override { - auto &q = ::arm_compute::CLScheduler::get().queue(); - - _target->map(q); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_target)->map(q); + } const size_t W = _target->info()->dimension(0); const size_t H = _target->info()->dimension(1); @@ -51,12 +57,16 @@ public: 
std::cout << std::endl; } - _target->unmap(q); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_target)->unmap(q); + } } private: std::string _tag; - ::arm_compute::ICLTensor *_target; + ::arm_compute::ITensor *_target; }; #endif // __FEATURE_LOGGING_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc index b3e4488..ac50dff 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc @@ -1,7 +1,7 @@ #include "GenericReshapeLayer.h" +#include "internal/arm_compute.h" -void GenericReshapeLayer::configure(::arm_compute::ICLTensor *input, - ::arm_compute::ICLTensor *output) +void GenericReshapeLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output) { _input = input; _output = output; @@ -17,15 +17,34 @@ void GenericReshapeLayer::configure(::arm_compute::ICLTensor *input, // const ::arm_compute::PermutationVector pv{2, 0, 1}; - _permute.configure(input, &_permuted, pv); - _reshape.configure(&_permuted, output); + if (::internal::arm_compute::isGpuMode()) + { + _cl_permute.configure(CAST_CL(input), &_cl_permuted, pv); + _cl_reshape.configure(&_cl_permuted, CAST_CL(output)); - // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. - _permuted.allocator()->allocate(); + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_permuted.allocator()->allocate(); + } + else + { + _neon_permute.configure(CAST_NE(input), &_neon_permuted, pv); + _neon_reshape.configure(&_neon_permuted, CAST_NE(output)); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. 
+ _neon_permuted.allocator()->allocate(); + } } void GenericReshapeLayer::run(void) { - _permute.run(); - _reshape.run(); + if (::internal::arm_compute::isGpuMode()) + { + _cl_permute.run(); + _cl_reshape.run(); + } + else + { + _neon_permute.run(); + _neon_reshape.run(); + } } diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h index ea6c950..c002f07 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h @@ -1,26 +1,34 @@ #ifndef __GENERIC_RESHAPE_LAYER_H__ #define __GENERIC_RESHAPE_LAYER_H__ +#include #include + #include #include +#include +#include class GenericReshapeLayer : public ::arm_compute::IFunction { public: - void configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output); + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output); public: void run(void) override; private: - ::arm_compute::ICLTensor *_input; - ::arm_compute::CLTensor _permuted; - ::arm_compute::ICLTensor *_output; + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + ::arm_compute::CLTensor _cl_permuted; + ::arm_compute::Tensor _neon_permuted; private: - ::arm_compute::CLPermute _permute; - ::arm_compute::CLReshapeLayer _reshape; + ::arm_compute::CLPermute _cl_permute; + ::arm_compute::CLReshapeLayer _cl_reshape; + + ::arm_compute::NEPermute _neon_permute; + ::arm_compute::NEReshapeLayer _neon_reshape; }; #endif // __GENERIC_RESHAPE_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h index 2bae649..f6bcfb5 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h @@ -1,13 +1,14 @@ #ifndef __SIMPLE_ARITHMETIC_ADDITION_H__ #define __SIMPLE_ARITHMETIC_ADDITION_H__ -#include +#include "internal/arm_compute.h" +#include class SimpleArithmeticAddition : public ::arm_compute::IFunction { public: - void configure(::arm_compute::ICLTensor *lhs, ::arm_compute::ICLTensor *rhs, - ::arm_compute::ICLTensor *out) + void configure(::arm_compute::ITensor *lhs, ::arm_compute::ITensor *rhs, + ::arm_compute::ITensor *out) { _lhs = lhs; _rhs = rhs; @@ -17,11 +18,14 @@ public: public: void run(void) override { - auto &q = ::arm_compute::CLScheduler::get().queue(); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); - _lhs->map(q); - _rhs->map(q); - _out->map(q); + CAST_CL(_lhs)->map(q); + CAST_CL(_rhs)->map(q); + CAST_CL(_out)->map(q); + } arm_compute::Window window; window.use_tensor_dimensions(_out->info()->tensor_shape()); @@ -69,15 +73,20 @@ public: } }); - _out->unmap(q); - _rhs->unmap(q); - _lhs->unmap(q); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_out)->unmap(q); + CAST_CL(_rhs)->unmap(q); + CAST_CL(_lhs)->unmap(q); + } } private: - ::arm_compute::ICLTensor *_lhs; - ::arm_compute::ICLTensor *_rhs; - ::arm_compute::ICLTensor *_out; + ::arm_compute::ITensor *_lhs; + ::arm_compute::ITensor *_rhs; + ::arm_compute::ITensor *_out; }; #endif // __SIMPLE_ARITHMETIC_ADDITION_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h index 
83f3030..5ea56ce 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h @@ -1,14 +1,15 @@ #ifndef __SIMPLE_CAST_LAYER_H__ #define __SIMPLE_CAST_LAYER_H__ -#include +#include +#include "internal/arm_compute.h" #include "internal/op/Cast.h" class SimpleCastLayer : public ::arm_compute::IFunction { public: - void configure(::arm_compute::ICLTensor *in, ::arm_compute::ICLTensor *out) + void configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out) { _in = in; _out = out; @@ -17,10 +18,12 @@ public: public: void run(void) override { - auto &q = ::arm_compute::CLScheduler::get().queue(); - - _in->map(q); - _out->map(q); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_in)->map(q); + CAST_CL(_out)->map(q); + } arm_compute::Window window; window.use_tensor_dimensions(_out->info()->tensor_shape()); @@ -28,11 +31,15 @@ public: execute_window_loop(window, [this](const arm_compute::Coordinates &id) { castData(_in, _out, id); }); - _out->unmap(q); - _in->unmap(q); + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_out)->unmap(q); + CAST_CL(_in)->unmap(q); + } } - void castData(::arm_compute::ICLTensor *in, ::arm_compute::ICLTensor *out, + void castData(::arm_compute::ITensor *in, ::arm_compute::ITensor *out, const arm_compute::Coordinates &id) { switch (in->info()->data_type()) @@ -65,8 +72,8 @@ public: } private: - ::arm_compute::ICLTensor *_in; - ::arm_compute::ICLTensor *_out; + ::arm_compute::ITensor *_in; + ::arm_compute::ITensor *_out; }; #endif // __SIMPLE_CAST_LAYER_H__ -- 2.7.4
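
Note for reviewers: the backend-selection pattern this patch applies throughout compilation.cc and the internal layers can be summarized in isolation as below. This is a minimal illustrative sketch, not code from the patch: the helper name buildRelu, the use of std::make_unique in place of nnfw::make_unique, and the inlined static_casts (standing in for CAST_CL/CAST_NE) are assumptions for the example; it assumes the ARM Compute Library headers named here are available. It mirrors the isGpuMode() convention introduced above: OpenCL is the default backend, and exporting NEON=1 switches a stage to its NEON counterpart.

#include <cstdlib>
#include <memory>

#include <arm_compute/core/ITensor.h>
#include <arm_compute/runtime/CL/CLTensor.h>
#include <arm_compute/runtime/CL/CLFunctions.h>
#include <arm_compute/runtime/NEON/NEFunctions.h>

// Mirrors internal::arm_compute::isGpuMode(): OpenCL is the default backend;
// exporting NEON=1 selects the NEON backend instead.
static bool isGpuMode()
{
  const char *neon = std::getenv("NEON");
  return !(neon != nullptr && neon[0] == '1');
}

// The plan stores plain ITensor pointers; each stage downcasts to the
// backend-specific tensor type (as CAST_CL / CAST_NE do) before configure().
static std::unique_ptr<::arm_compute::IFunction>
buildRelu(::arm_compute::ITensor *ifm, ::arm_compute::ITensor *ofm)
{
  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};

  if (isGpuMode())
  {
    auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
    fn->configure(static_cast<::arm_compute::CLTensor *>(ifm),
                  static_cast<::arm_compute::CLTensor *>(ofm), act_info);
    return fn;
  }

  auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
  fn->configure(ifm, ofm, act_info);
  return fn;
}

A function built this way is appended to the plan exactly as the CL-only version was before and is executed later through IFunction::run(), so switching backends at run time needs no recompilation, only the NEON=1 environment variable.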