From f916d45179fd382b8f8f30022bba639472b27c30 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Prasanna=20R/System=20SW=20/SRI-Bangalore/Engineer/?=
 =?utf8?q?=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?=
Date: Tue, 16 Oct 2018 15:59:10 +0530
Subject: [PATCH] Implement BATCH_TO_SPACE_ND in runtime (#3101)

This patch implements the BATCH_TO_SPACE_ND operation in the
pure_arm_compute runtime.

Signed-off-by: prasannar
---
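Note for reviewers (not part of the commit): the sketch below is a minimal,
self-contained reference for the index mapping that the new BatchToSpaceND()
helper applies, assuming a plain NHWC layout with the identity axis order.
The name batch_to_space_ref and its signature are illustrative only; the
runtime layer additionally permutes axes via getARMComputeAxises().

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Reference-only sketch of BATCH_TO_SPACE_ND for a float NHWC tensor.
  static void batch_to_space_ref(const std::vector<float> &input, int batch, int height,
                                 int width, int depth, const int32_t block[2],
                                 std::vector<float> &output)
  {
    assert(input.size() == static_cast<size_t>(batch) * height * width * depth);
    const int out_batch = batch / (block[0] * block[1]);
    const int out_height = height * block[0];
    const int out_width = width * block[1];
    output.assign(static_cast<size_t>(out_batch) * out_height * out_width * depth, 0.0f);

    for (int ob = 0; ob < out_batch; ++ob)
      for (int oh = 0; oh < out_height; ++oh)
        for (int ow = 0; ow < out_width; ++ow)
          for (int od = 0; od < depth; ++od)
          {
            // Same mapping as BatchToSpaceND() in SimpleBatchToSpaceNd.cc: the
            // input batch index encodes the (oh % block[0], ow % block[1])
            // offset of each element inside its block.
            const int ib = ob + ((oh % block[0]) * block[1] + ow % block[1]) * out_batch;
            const int ih = oh / block[0];
            const int iw = ow / block[1];
            output[((static_cast<size_t>(ob) * out_height + oh) * out_width + ow) * depth + od] =
                input[((static_cast<size_t>(ib) * height + ih) * width + iw) * depth + od];
          }
  }

For example, with block = {2, 2} and an input of shape [4, 1, 1, 1], input
element b ends up at output position (h, w) = (b / 2, b % 2) of the single
output batch, which is consistent with the shape asserts added to
Planner::visit() for BatchToSpaceNd below.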
 runtimes/pure_arm_compute/src/compilation.cc      |  61 +++++++++-
 .../src/internal/layers/SimpleBatchToSpaceNd.cc   | 131 +++++++++++++++++++++
 .../src/internal/layers/SimpleBatchToSpaceNd.h    |  45 +++++++
 3 files changed, 235 insertions(+), 2 deletions(-)
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.h

diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index 05a9cba..588a284 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -85,6 +85,7 @@
 #include "internal/layers/SimpleSpaceToDepth.h"
 #include "internal/layers/SimpleEmbeddingLookup.h"
 #include "internal/layers/SimpleDepthToSpace.h"
+#include "internal/layers/SimpleBatchToSpaceNd.h"
 #include "internal/layers/HashtableLookupLayer.h"
 #include "internal/layers/SimpleSpaceToBatchND.h"
 #include "internal/layers/SimpleNeg.h"
@@ -3716,8 +3717,64 @@ void Planner::visit(const ::internal::tflite::op::SpaceToBatchND::Node &node)
 
 void Planner::visit(const ::internal::tflite::op::BatchToSpaceNd::Node &node)
 {
-  // TODO Implement BatchToSpace op
-  throw std::runtime_error("Not supported, yet");
+  const ::internal::tflite::operand::Index output_index{node.param().output_index};
+  const ::internal::tflite::operand::Index input_index{node.param().input_index};
+  const ::internal::tflite::operand::Index block_size_index{node.param().block_size_index};
+
+  assert(_ctx.at(input_index).shape().rank() == 4);
+  assert(_ctx.at(output_index).shape().rank() == 4);
+
+  const int32_t *block_size =
+      reinterpret_cast<const int32_t *>(_ctx.at(block_size_index).data().base());
+
+  const auto &output_shape = _ctx.at(output_index).shape();
+  const auto &input_shape = _ctx.at(input_index).shape();
+
+  assert(block_size[0] > 0 && block_size[1] > 0);
+  {
+    assert(output_shape.dim(3) == input_shape.dim(3));
+    assert(output_shape.dim(1) == input_shape.dim(1) * block_size[0]);
+    assert(output_shape.dim(2) == input_shape.dim(2) * block_size[1]);
+    assert(output_shape.dim(0) == input_shape.dim(0) / (block_size[0] * block_size[1]));
+  }
+
+  // Set Shape Constraints and TensorInfo
+  _builder.addShapeConstr(
+      output_index, asTensorInfo(asTensorShape(output_shape, false), _ctx.at(output_index).type(),
+                                 _ctx.at(output_index).scale(), _ctx.at(output_index).zeroPoint()));
+  _builder.addShapeConstr(
+      input_index, asTensorInfo(asTensorShape(input_shape, false), _ctx.at(input_index).type(),
+                                _ctx.at(input_index).scale(), _ctx.at(input_index).zeroPoint()));
+
+  // Construct operation parameters
+  struct Param
+  {
+    int output_index;
+    int input_index;
+    const int32_t *block_size;
+    int32_t rank;
+  };
+
+  Param param;
+
+  param.output_index = output_index.asInt();
+  param.input_index = input_index.asInt();
+  param.block_size = block_size;
+  param.rank = _ctx.at(input_index).shape().rank();
+
+  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+    auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index});
+    auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index});
+
+    auto fn = nnfw::make_unique<SimpleBatchToSpaceND>();
+
+    fn->configure(input_alloc, output_alloc, param.block_size, getARMComputeAxises(param.rank));
+
+    builder.append("BatchToSpaceND", std::move(fn));
+
+  };
+
+  _builder.addStage(stage);
 }
 
 void Planner::visit(const ::internal::tflite::op::L2Normalization::Node &node)
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.cc
new file mode 100644
index 0000000..d485e8a
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.cc
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "internal/layers/SimpleBatchToSpaceNd.h"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void SimpleBatchToSpaceND::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output,
+                                     const int32_t *block_size,
+                                     const ::arm_compute::Coordinates &axises = {3, 1, 0, 2})
+{
+  const auto rank = axises.num_dimensions();
+  assert(rank == 4);
+
+  for (int i = 0; i < rank; ++i)
+    assert(axises[i] >= 0 && axises[i] < rank);
+
+  _input = input;
+  _output = output;
+  _block_size = block_size;
+  _axises = axises;
+}
+
+inline int32_t Offset4D(const ::arm_compute::TensorShape &shape, int32_t b, int32_t h, int32_t w,
+                        int32_t d, const ::arm_compute::Coordinates &axises)
+{
+  // b, h, w, d >= 0
+  size_t indexes[4];
+  indexes[axises[0]] = b;
+  indexes[axises[1]] = h;
+  indexes[axises[2]] = w;
+  indexes[axises[3]] = d;
+
+  int32_t offset = indexes[3] * shape[2] * shape[1] * shape[0];
+  offset += indexes[2] * shape[1] * shape[0];
+  offset += indexes[1] * shape[0];
+  offset += indexes[0];
+  return offset;
+}
+
+template <typename T>
+inline void BatchToSpaceND(const T *input_data, const ::arm_compute::TensorShape &input_shape,
+                           const int32_t *block_size_data, T *output_data,
+                           const ::arm_compute::TensorShape &output_shape,
+                           const ::arm_compute::Coordinates &axises)
+{
+  const int input_batch = input_shape[axises[0]];
+  const int input_height = input_shape[axises[1]];
+  const int input_width = input_shape[axises[2]];
+
+  const int output_batch = output_shape[axises[0]];
+  const int output_height = output_shape[axises[1]];
+  const int output_width = output_shape[axises[2]];
+  const int depth = output_shape[axises[3]];
+
+  for (int out_b = 0; out_b < output_batch; ++out_b)
+  {
+    for (int out_h = 0; out_h < output_height; ++out_h)
+    {
+      for (int out_w = 0; out_w < output_width; ++out_w)
+      {
+        for (int out_d = 0; out_d < depth; ++out_d)
+        {
+          const int in_d = out_d;
+          const int in_h = out_h / block_size_data[0];
+          const int in_w = out_w / block_size_data[1];
+          const int in_b =
+              out_b +
+              ((out_h % block_size_data[0]) * block_size_data[1] + out_w % block_size_data[1]) *
+                  output_batch;
+
+          const int output_index = Offset4D(output_shape, out_b, out_h, out_w, out_d, axises);
+          const int input_index = Offset4D(input_shape, in_b, in_h, in_w, in_d, axises);
+
+          output_data[output_index] = input_data[input_index];
+        }
+      }
+    }
+  }
+}
+void SimpleBatchToSpaceND::run()
+{
+  if (::internal::arm_compute::isGpuMode())
+  {
+    auto &q = ::arm_compute::CLScheduler::get().queue();
+
+    CAST_CL(_input)->map(q);
+    CAST_CL(_output)->map(q);
+  }
+
+  auto input_buf = _input->buffer();
+  auto output_buf = _output->buffer();
+  switch (_input->info()->data_type())
+  {
+    case ::arm_compute::DataType::U8:
+    case ::arm_compute::DataType::QASYMM8:
+      BatchToSpaceND(reinterpret_cast<const uint8_t *>(input_buf), _input->info()->tensor_shape(),
+                     _block_size, reinterpret_cast<uint8_t *>(output_buf),
+                     _output->info()->tensor_shape(), _axises);
+      break;
+    case ::arm_compute::DataType::F32:
+      BatchToSpaceND(reinterpret_cast<const float *>(input_buf), _input->info()->tensor_shape(),
+                     _block_size, reinterpret_cast<float *>(output_buf),
+                     _output->info()->tensor_shape(), _axises);
+      break;
+    default:
+      ARM_COMPUTE_ERROR("DataType not supported");
+      break;
+  }
+
+  if (::internal::arm_compute::isGpuMode())
+  {
+    auto &q = ::arm_compute::CLScheduler::get().queue();
+
+    CAST_CL(_input)->unmap(q);
+    CAST_CL(_output)->unmap(q);
+  }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.h
new file mode 100644
index 0000000..52a1d35
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SIMPLE_BATCH_TO_SPACE_ND_H__
+#define __SIMPLE_BATCH_TO_SPACE_ND_H__
+
+#include "internal/arm_compute.h"
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+class SimpleBatchToSpaceND : public ::arm_compute::IFunction
+{
+public:
+  /** Initialise input and output
+   *
+   * @param[in]  input       First tensor input.
+   * @param[out] output      Output tensor.
+   * @param[in]  block_size  Block size.
+   */
+  void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output,
+                 const int32_t *block_size, const ::arm_compute::Coordinates &axises);
+
+  void run() override;
+
+private:
+  ::arm_compute::ITensor *_input;
+  ::arm_compute::ITensor *_output;
+  const int32_t *_block_size;
+  ::arm_compute::Coordinates _axises;
+};
+
+#endif /* __SIMPLE_BATCH_TO_SPACE_ND_H__ */
-- 
2.7.4