Implement SPACE_TO_DEPTH operation for CPU (#2614)
author Jiseob Jang/Motion Control Lab(SR)/Engineer/Samsung Electronics <jiseob.jang@samsung.com>
Thu, 6 Sep 2018 05:33:29 +0000 (14:33 +0900)
committer Chunseok Lee/Motion Control Lab(SR)/Staff Engineer/Samsung Electronics <chunseok.lee@samsung.com>
Thu, 6 Sep 2018 05:33:29 +0000 (14:33 +0900)
This commit implements the SPACE_TO_DEPTH operation in pure_arm_compute for CPU.
- Add a layer that supports SPACE_TO_DEPTH on CPU.
- Add a function that builds the ::arm_compute::Coordinates for a given rank.
- Implement the SPACE_TO_DEPTH operation (a worked example of its semantics follows below).
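
For illustration (shapes chosen here, not taken from the change): with block_size = 2,
an NHWC input of shape (1, 4, 4, 1) becomes an output of shape (1, 2, 2, 4). Each
2x2 spatial patch is folded into the depth dimension, so
output_depth = input_depth * block_size * block_size, while height and width are
divided by block_size.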

Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
runtimes/pure_arm_compute/src/compilation.cc
runtimes/pure_arm_compute/src/internal/arm_compute/Cast.h
runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc [new file with mode: 0644]
runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h [new file with mode: 0644]

index a02cb3b..0e21d8e 100644
@@ -52,6 +52,7 @@
 #include "internal/layers/SimpleCastLayer.h"
 #include "internal/layers/GenericFullyConnectedLayer.h"
 #include "internal/layers/PadLayer.h"
+#include "internal/layers/SimpleSpaceToDepth.h"
 
 #include "util/matrix/IndexIterator.h"
 #include "util/kernel/IndexIterator.h"
@@ -3365,8 +3366,47 @@ void Planner::visit(const ::internal::tflite::op::Pad::Node &node)
 
 void Planner::visit(const ::internal::tflite::op::SpaceToDepth::Node &node)
 {
-  // TODO Implement SPACE_TO_DEPTH op
-  throw std::runtime_error("Not supported, yet");
+  const ::internal::tflite::operand::Index output_index{node.param().output_index};
+  const ::internal::tflite::operand::Index input_index{node.param().input_index};
+  const ::internal::tflite::operand::Index block_size_index{node.param().block_size_index};
+
+  // Set Shape Constraints and TensorInfo
+  _builder.addShapeConstr(output_index,
+                          asTensorInfo(asTensorShape(_ctx.at(output_index).shape(), false),
+                                       _ctx.at(output_index).type(), _ctx.at(output_index).scale(),
+                                       _ctx.at(output_index).zeroPoint()));
+  _builder.addShapeConstr(input_index,
+                          asTensorInfo(asTensorShape(_ctx.at(input_index).shape(), false),
+                                       _ctx.at(input_index).type(), _ctx.at(input_index).scale(),
+                                       _ctx.at(input_index).zeroPoint()));
+
+  // Construct operation parameters
+  struct Param
+  {
+    int output_index;
+    int input_index;
+    int32_t block_size;
+  };
+
+  Param param;
+
+  param.output_index = output_index.asInt();
+  param.input_index = input_index.asInt();
+  param.block_size = _ctx.at(block_size_index).asScalar<int32_t>();
+
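+  // This stage runs after tensor allocation: it constructs the layer and appends it to the execution plan.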
+  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+    auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index});
+    auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index});
+    auto rank = 4;
+
+    auto fn = nnfw::make_unique<SimpleSpaceToDepth>();
+
+    fn->configure(input_alloc, output_alloc, param.block_size, getARMComputeAxises(rank));
+    builder.append("SpaceToDepth", std::move(fn));
+  };
+
+  _builder.addStage(stage);
 }
 
 class AllocationContext final : public IAllocationContext
index d4c5dd0..8dc9821 100644
@@ -5,6 +5,22 @@
 #include "internal/Swizzle.h"
 #include "internal/Model.h"
 
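+// Build ::arm_compute::Coordinates that maps each runtime axis to its ARM Compute
+// counterpart; e.g. for rank 4 (NHWC), this yields {3, 1, 0, 2} (N->3, H->1, W->0, C->2).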
+inline ::arm_compute::Coordinates getARMComputeAxises(uint32_t rank)
+{
+  ::arm_compute::Coordinates res{};
+
+  res.set_num_dimensions(rank);
+
+  for (uint32_t axis = 0; axis < rank; ++axis)
+  {
+    res.set(axis, ToARMComputeAxis(rank, axis).value());
+  }
+
+  return res;
+}
+
 inline ::arm_compute::TensorShape asTensorShape(const internal::tflite::operand::Shape &shape,
                                                 bool apply_dim_correction = true)
 {
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc
new file mode 100644
index 0000000..d60ba83
--- /dev/null
@@ -0,0 +1,143 @@
+#include "internal/layers/SimpleSpaceToDepth.h"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void SimpleSpaceToDepth::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output,
+                                   int32_t block_size, const ::arm_compute::Coordinates &axises)
+{
+  assert(input->info()->num_dimensions() == 4);
+  assert(output->info()->num_dimensions() == 4);
+  const auto rank = axises.num_dimensions();
+  assert(rank == 4);
+  for (size_t i = 0; i < rank; ++i)
+  {
+    assert(axises[i] >= 0);
+    assert(static_cast<size_t>(axises[i]) < rank);
+  }
+
+  _input = input;
+  _output = output;
+  _block_size = block_size;
+  _axises = axises;
+}
+
+inline int32_t Offset4D(const ::arm_compute::TensorShape &shape, int32_t b, int32_t h, int32_t w,
+                        int32_t d, const ::arm_compute::Coordinates &axises)
+{
+  // b, h, w, d >= 0
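+  // ARM Compute keeps dimension 0 as the innermost (contiguous) axis, so the
+  // linear offset uses strides of 1, shape[0], shape[0]*shape[1], and so on.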
+  size_t indexes[4];
+  indexes[axises[0]] = b;
+  indexes[axises[1]] = h;
+  indexes[axises[2]] = w;
+  indexes[axises[3]] = d;
+
+  int32_t offset = indexes[3] * shape[2] * shape[1] * shape[0];
+  offset += indexes[2] * shape[1] * shape[0];
+  offset += indexes[1] * shape[0];
+  offset += indexes[0];
+  return offset;
+}
+
+template <typename T>
+inline void SpaceToDepth(const T *input_data, const ::arm_compute::TensorShape &input_shape,
+                         int32_t block_size, T *output_data,
+                         const ::arm_compute::TensorShape &output_shape,
+                         const ::arm_compute::Coordinates &axises)
+{
+  const int input_batch = input_shape[axises[0]];
+  const int input_height = input_shape[axises[1]];
+  const int input_width = input_shape[axises[2]];
+  const int input_depth = input_shape[axises[3]];
+
+  const int output_batch = output_shape[axises[0]];
+  const int output_height = output_shape[axises[1]];
+  const int output_width = output_shape[axises[2]];
+  const int output_depth = output_shape[axises[3]];
+
+  assert(input_batch == output_batch);
+  assert(input_height == output_height * block_size);
+  assert(input_width == output_width * block_size);
+  assert(input_depth * block_size * block_size == output_depth);
+
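+  // Walk every input element, folding each block_size x block_size spatial patch into the output depth.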
+  for (int in_b = 0; in_b < input_batch; ++in_b)
+  {
+    for (int in_h = 0; in_h < input_height; ++in_h)
+    {
+      for (int in_w = 0; in_w < input_width; ++in_w)
+      {
+        for (int in_d = 0; in_d < input_depth; ++in_d)
+        {
+          const int out_b = in_b;
+          const int out_h = in_h / block_size;
+          const int out_w = in_w / block_size;
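+          // The position inside the spatial block selects which input_depth-sized slice of the output depth receives the value.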
+          const int out_d =
+              in_d + ((in_h % block_size) * block_size + in_w % block_size) * input_depth;
+
+          const int input_index = Offset4D(input_shape, in_b, in_h, in_w, in_d, axises);
+          const int output_index = Offset4D(output_shape, out_b, out_h, out_w, out_d, axises);
+
+          output_data[output_index] = input_data[input_index];
+        }
+      }
+    }
+  }
+}
+
+void SimpleSpaceToDepth::run()
+{
+  if (::internal::arm_compute::isGpuMode())
+  {
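+    // Map the CL buffers so the host-side loop below can access them directly.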
+    auto &q = ::arm_compute::CLScheduler::get().queue();
+
+    CAST_CL(_input)->map(q);
+    CAST_CL(_output)->map(q);
+  }
+
+  auto input_buf = _input->buffer();
+  auto output_buf = _output->buffer();
+  switch (_input->info()->data_type())
+  {
+    case ::arm_compute::DataType::U8:
+    case ::arm_compute::DataType::QASYMM8:
+      SpaceToDepth(reinterpret_cast<const uint8_t *>(input_buf), _input->info()->tensor_shape(),
+                   _block_size, reinterpret_cast<uint8_t *>(output_buf),
+                   _output->info()->tensor_shape(), _axises);
+      break;
+    case ::arm_compute::DataType::S8:
+      SpaceToDepth(reinterpret_cast<const int8_t *>(input_buf), _input->info()->tensor_shape(),
+                   _block_size, reinterpret_cast<int8_t *>(output_buf),
+                   _output->info()->tensor_shape(), _axises);
+      break;
+    case ::arm_compute::DataType::U32:
+      SpaceToDepth(reinterpret_cast<const uint32_t *>(input_buf), _input->info()->tensor_shape(),
+                   _block_size, reinterpret_cast<uint32_t *>(output_buf),
+                   _output->info()->tensor_shape(), _axises);
+      break;
+    case ::arm_compute::DataType::S32:
+      SpaceToDepth(reinterpret_cast<const int32_t *>(input_buf), _input->info()->tensor_shape(),
+                   _block_size, reinterpret_cast<int32_t *>(output_buf),
+                   _output->info()->tensor_shape(), _axises);
+      break;
+    case ::arm_compute::DataType::F32:
+      SpaceToDepth(reinterpret_cast<const float *>(input_buf), _input->info()->tensor_shape(),
+                   _block_size, reinterpret_cast<float *>(output_buf),
+                   _output->info()->tensor_shape(), _axises);
+      break;
+    default:
+      ARM_COMPUTE_ERROR("DataType not supported");
+      break;
+  }
+
+  if (::internal::arm_compute::isGpuMode())
+  {
+    auto &q = ::arm_compute::CLScheduler::get().queue();
+
+    CAST_CL(_input)->unmap(q);
+    CAST_CL(_output)->unmap(q);
+  }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h
new file mode 100644
index 0000000..92c374f
--- /dev/null
@@ -0,0 +1,30 @@
+#ifndef __SIMPLE_SPACE_TO_DEPTH_H__
+#define __SIMPLE_SPACE_TO_DEPTH_H__
+
+#include "internal/arm_compute.h"
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+class SimpleSpaceToDepth : public ::arm_compute::IFunction
+{
+public:
+  /** Initialise input and output
+   *
+   * @param[in]  input       Source tensor.
+   * @param[out] output      Destination tensor.
+   * @param[in]  block_size  Size of the spatial block folded into depth.
+   * @param[in]  axises      Axis mapping from the runtime layout to the ARM Compute layout.
+   */
+  void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, int32_t block_size,
+                 const ::arm_compute::Coordinates &axises);
+
+  void run() override;
+
+private:
+  ::arm_compute::ITensor *_input;
+  ::arm_compute::ITensor *_output;
+  int32_t _block_size;
+  ::arm_compute::Coordinates _axises;
+};
+
+#endif /* __SIMPLE_SPACE_TO_DEPTH_H__ */
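
For reference, a minimal sketch of how the new layer is driven, mirroring the stage
added to compilation.cc (the tensor names are illustrative; input and output are
assumed to be allocated 4-D ::arm_compute::ITensor pointers):

    auto fn = nnfw::make_unique<SimpleSpaceToDepth>();
    // block_size = 2 halves each spatial dimension and multiplies depth by 4.
    fn->configure(input, output, /*block_size=*/2, getARMComputeAxises(4));
    fn->run(); // maps/unmaps the CL buffers itself when running in GPU mode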