From ad6c37d2bb56ad36fca25a73edaf122dec2964dd Mon Sep 17 00:00:00 2001
From: =?utf8?q?Shubham=20Gupta/System=20SW=20/SRI-Bangalore/Engineer/?=
 =?utf8?q?=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= <shub98.gupta@samsung.com>
Date: Tue, 23 Oct 2018 05:39:55 +0530
Subject: [PATCH] PAD op in PACL as CPU Fallback (#2857)

This patch adds the PAD op to PACL as a CPU fallback (SimplePadLayer), replacing the use of PadLayer in compilation.cc.

Signed-off-by: shubham <shub98.gupta@samsung.com>
---
 runtimes/pure_arm_compute/src/compilation.cc       |  49 ++++---
 .../src/internal/layers/SimplePadLayer.cc          | 142 +++++++++++++++++++++
 .../src/internal/layers/SimplePadLayer.h           |  46 +++++++
 3 files changed, 222 insertions(+), 15 deletions(-)
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.h
diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index e79b581..d589e04 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -88,9 +88,9 @@
 #include "internal/arm_compute/tensor/View.h"
 #include "internal/layers/GenericReshapeLayer.h"
 #include "internal/layers/SimpleArithmeticAddition.h"
+#include "internal/layers/SimplePadLayer.h"
 #include "internal/layers/SimpleCastLayer.h"
 #include "internal/layers/GenericFullyConnectedLayer.h"
-#include "internal/layers/PadLayer.h"
 #include "internal/layers/SimpleSpaceToDepth.h"
 #include "internal/layers/SimpleEmbeddingLookup.h"
 #include "internal/layers/SimpleDepthToSpace.h"
@@ -3636,6 +3636,10 @@ void Planner::visit(const ::internal::tflite::op::Pad::Node &node)
   const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index};
   const ::internal::tflite::operand::Index paddings_index{node.param().paddings_index};
 
+  // The paddings are copied by an initializer registered below, so the operand
+  // must be a constant that already carries its data.
+  assert(_ctx.at(paddings_index).hasData() == true);
+
   // Set Shape Constraints and TensorInfo
   _builder.addShapeConstr(
       ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()), _ctx.at(ifm_index).type(),
@@ -3648,37 +3652,52 @@ void Planner::visit(const ::internal::tflite::op::Pad::Node &node)
       asTensorInfo(asTensorShape(_ctx.at(paddings_index).shape()), _ctx.at(paddings_index).type(),
                    _ctx.at(paddings_index).scale(), _ctx.at(paddings_index).zeroPoint()));
 
+  // Register an initializer that fills the paddings tensor from the operand's constant data.
+  {
+    auto pad_type = _ctx.at(paddings_index).type();
+    // Only INT32 paddings are handled; any other operand type is rejected below.
+    if (pad_type == ANEURALNETWORKS_TENSOR_INT32)
+    {
+      auto pad_base = _ctx.at(paddings_index).data().base();
+      auto pad_size = _ctx.at(paddings_index).data().size();
+      auto pad_shape = _ctx.at(paddings_index).shape().asMatrix();
+      // The paddings operand is handled as a matrix (asMatrix above).
+      // Only the height and width paddings are supported.
+      auto initializer = std::bind(initMatrixTensor<int32_t>, _1, pad_shape, pad_base, pad_size);
+      _builder.addInitializer(paddings_index, initializer);
+    }
+    else
+    {
+      throw std::runtime_error("Only Int32 datatype is supported for Pad values");
+    }
+  }
+
   // Construct operation parameters
   struct Param
   {
     int ofm_index;
     int ifm_index;
-    int32_t padding_size;
+    int padding_index;
   };
 
   Param param;
 
   param.ofm_index = ofm_index.asInt();
   param.ifm_index = ifm_index.asInt();
-
-  assert(_ctx.at(paddings_index).hasData() == true);
-
-  // TODO: Currently we are supporting uniform padding for the tensor, so only a single
-  //      value is being read. (TOP = BOTTOM = LEFT = RIGHT).
-  //      Need to read padding values for all the sides (TOP, BOTTOM, LEFT & RIGHT)
-
-  const auto &padding_data = _ctx.at(paddings_index).data();
-  auto base = padding_data.base();
-  auto padsize = reinterpret_cast<const int *>(base) + 3;
-  param.padding_size = *padsize;
+  param.padding_index = paddings_index.asInt();
 
   auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
     auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index});
     auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index});
+    auto pad_alloc = ctx.at(::internal::tflite::operand::Index{param.padding_index});
+
+    auto fn = nnfw::make_unique<SimplePadLayer>();
+
+    // only 4d Tensors are supported
+    int rank = 4;
 
-    auto fn = nnfw::make_unique<PadLayer>();
+    fn->configure(ifm_alloc, ofm_alloc, pad_alloc, getARMComputeAxises(rank));
 
-    fn->configure(ifm_alloc, ofm_alloc, param.padding_size);
     builder.append("Pad", std::move(fn));
 
   };
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.cc
new file mode 100644
index 0000000..65bb512
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.cc
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "internal/layers/SimplePadLayer.h"
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+// Stores the tensors and the axis mapping used later by run(); no tensor data is accessed.
+// Debug builds assert non-null tensors and that `axises` has 4 entries, each within [0, 4).
+void SimplePadLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output,
+                               ::arm_compute::ITensor *padding_size,
+                               const ::arm_compute::Coordinates &axises)
+{
+  const int rank = static_cast<int>(axises.num_dimensions()); // signed: compared with int below
+  assert(rank == 4);
+  assert(input != nullptr && output != nullptr && padding_size != nullptr);
+
+  for (int i = 0; i < rank; ++i)
+  {
+    assert(axises[i] >= 0 && axises[i] < rank);
+  }
+
+  _input = input;
+  _output = output;
+  _padding_size = padding_size;
+  _axises = axises;
+}
+
+// Copies `input_data` into `output_data` and fills the border that the paddings add along
+// height and width with zeros; batch and depth are copied through unpadded.
+//
+// `padding_size` holds INT32 values: element {0, d} is the amount added before and {1, d} the
+// amount added after logical dimension d (1: height, 2: width). `axises` is the mapping
+// passed to asARMComputeCoordinates() for logical (batch, height, width, depth) coordinates.
+//
+// NOTE(review): padded elements are written as literal 0 for every T, including quantized
+// data where real zero is presumably the zero-point - confirm that this is intended.
+template <typename T>
+inline void ApplyPadding(const ::arm_compute::ITensor *input_data,
+                         const ::arm_compute::TensorShape &input_shape,
+                         const ::arm_compute::ITensor *padding_size,
+                         ::arm_compute::ITensor *output_data,
+                         const ::arm_compute::TensorShape &output_shape,
+                         const ::arm_compute::Coordinates &axises)
+{
+  const int input_height = input_shape[axises[1]];
+  const int input_width = input_shape[axises[2]];
+
+  const int batch = output_shape[axises[0]];
+  const int output_height = output_shape[axises[1]];
+  const int output_width = output_shape[axises[2]];
+  const int depth = output_shape[axises[3]];
+
+  // Supports only spatial padding: only the height and width entries are read.
+  const auto pad_at = [padding_size](int side, int dim) {
+    return *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({side, dim}));
+  };
+  const int32_t pad_top = pad_at(0, 1);
+  const int32_t pad_left = pad_at(0, 2);
+
+  assert(input_shape[axises[0]] == output_shape[axises[0]]);
+  assert(input_height + pad_top + pad_at(1, 1) == output_height);
+  assert(input_width + pad_left + pad_at(1, 2) == output_width);
+  assert(input_shape[axises[3]] == output_shape[axises[3]]);
+
+  // Walk the output tensor; bounding the loops by its shape keeps every write inside it.
+  for (int b = 0; b < batch; ++b)
+  {
+    for (int h = 0; h < output_height; ++h)
+    {
+      const bool pad_row = h < pad_top || h >= pad_top + input_height;
+      for (int w = 0; w < output_width; ++w)
+      {
+        const bool is_pad = pad_row || w < pad_left || w >= pad_left + input_width;
+        for (int d = 0; d < depth; ++d)
+        {
+          const auto output_id =
+              asARMComputeCoordinates(::arm_compute::Coordinates{b, h, w, d}, axises);
+          auto *dst = reinterpret_cast<T *>(output_data->ptr_to_element(output_id));
+          if (is_pad)
+          {
+            *dst = 0;
+          }
+          else
+          {
+            const auto input_id = asARMComputeCoordinates(
+                ::arm_compute::Coordinates{b, h - pad_top, w - pad_left, d}, axises);
+            *dst = *reinterpret_cast<const T *>(input_data->ptr_to_element(input_id));
+          }
+        }
+      }
+    }
+  }
+}
+// Runs the padding on the host; in GPU mode the three tensors are mapped around the copy.
+void SimplePadLayer::run()
+{
+  if (::internal::arm_compute::isGpuMode())
+  {
+    auto &queue = ::arm_compute::CLScheduler::get().queue();
+    CAST_CL(_input)->map(queue);
+    CAST_CL(_output)->map(queue);
+    CAST_CL(_padding_size)->map(queue);
+  }
+
+  // Dispatch on the input element type; both 8-bit types share the uint8_t instantiation.
+  const auto type = _input->info()->data_type();
+  if (type == ::arm_compute::DataType::U8 || type == ::arm_compute::DataType::QASYMM8)
+  {
+    ApplyPadding<uint8_t>(_input, _input->info()->tensor_shape(), _padding_size, _output,
+                          _output->info()->tensor_shape(), _axises);
+  }
+  else if (type == ::arm_compute::DataType::F32)
+  {
+    ApplyPadding<float>(_input, _input->info()->tensor_shape(), _padding_size, _output,
+                        _output->info()->tensor_shape(), _axises);
+  }
+  else
+  {
+    ARM_COMPUTE_ERROR("DataType not supported");
+  }
+
+  if (::internal::arm_compute::isGpuMode())
+  {
+    auto &queue = ::arm_compute::CLScheduler::get().queue();
+    CAST_CL(_input)->unmap(queue);
+    CAST_CL(_output)->unmap(queue);
+    CAST_CL(_padding_size)->unmap(queue);
+  }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.h
new file mode 100644
index 0000000..e636a7c
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SIMPLE_PAD_LAYER_H__
+#define __SIMPLE_PAD_LAYER_H__
+
+#include "internal/arm_compute.h"
+#include "internal/arm_compute/Cast.h"
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+// IFunction that pads a tensor on the host (see configure() and run()).
+class SimplePadLayer : public ::arm_compute::IFunction
+{
+public:
+  SimplePadLayer(void) = default;
+
+  // Stores the tensors and the axis mapping that run() will use.
+  void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output,
+                 ::arm_compute::ITensor *padding_size,
+                 const ::arm_compute::Coordinates &axises = getARMComputeAxises(4));
+
+  // Executes the padding on the tensors given to configure().
+  void run(void) override;
+
+private:
+  ::arm_compute::ITensor *_input{nullptr};
+  ::arm_compute::ITensor *_output{nullptr};
+  ::arm_compute::ITensor *_padding_size{nullptr};
+  ::arm_compute::Coordinates _axises{};
+};
+
+#endif // __SIMPLE_PAD_LAYER_H__
-- 
2.7.4