From 75c9ba313fbfe959ad3c456d11089064fad66968 Mon Sep 17 00:00:00 2001 From: =?utf8?q?=EB=B0=95=EC=A2=85=ED=98=84/=EB=8F=99=EC=9E=91=EC=A0=9C?= =?utf8?q?=EC=96=B4Lab=28SR=29/Staff=20Engineer/=EC=82=BC=EC=84=B1?= =?utf8?q?=EC=A0=84=EC=9E=90?= Date: Tue, 12 Jun 2018 16:53:43 +0900 Subject: [PATCH] [Pure CL] Support DepthwiseConv2D (#1661) This commit introduces DepthwiseConv2D operation support in pure CL runtime. Signed-off-by: Jonghyun Park --- runtimes/pure_arm_compute/src/compilation.cc | 121 +++++++++++++++++++++ .../src/internal/op/DepthwiseConv2D.cc | 67 ++++++++++++ .../src/internal/op/DepthwiseConv2D.h | 65 +++++++++++ .../pure_arm_compute/src/internal/op/NodeVisitor.h | 2 + runtimes/pure_arm_compute/src/model.cc | 25 +++++ 5 files changed, 280 insertions(+) create mode 100644 runtimes/pure_arm_compute/src/internal/op/DepthwiseConv2D.cc create mode 100644 runtimes/pure_arm_compute/src/internal/op/DepthwiseConv2D.h diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc index cb887d2..4bf4d96 100644 --- a/runtimes/pure_arm_compute/src/compilation.cc +++ b/runtimes/pure_arm_compute/src/compilation.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "internal/arm_compute/Cast.h" #include "internal/arm_compute/kernel/View.h" @@ -301,6 +302,7 @@ public: void visit(const ::internal::tflite::op::Mul::Node &node) override; void visit(const ::internal::tflite::op::Div::Node &node) override; void visit(const ::internal::tflite::op::Conv2D::implicit::Node &node) override; + void visit(const ::internal::tflite::op::DepthwiseConv2D::implicit::Node &node) override; void visit(const ::internal::tflite::op::MaxPool2D::implicit::Node &node) override; void visit(const ::internal::tflite::op::AvgPool2D::implicit::Node &node) override; void visit(const ::internal::tflite::op::Concat::Node &node) override; @@ -798,6 +800,125 @@ void Planner::visit(const ::internal::tflite::op::Conv2D::implicit::Node &node) 
_builder.addStage(stage); } +void Planner::visit(const ::internal::tflite::op::DepthwiseConv2D::implicit::Node &node) +{ + const ::internal::tflite::operand::Index ofm_index{node.param().ofm_index}; + + const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index}; + const ::internal::tflite::operand::Index ker_index{node.param().ker_index}; + const ::internal::tflite::operand::Index bias_index{node.param().bias_index}; + + const ::internal::tflite::operand::Index vstride_index{node.param().vstride_index}; + const ::internal::tflite::operand::Index hstride_index{node.param().hstride_index}; + + const ::internal::tflite::operand::Index padding_index{node.param().padding_index}; + const ::internal::tflite::operand::Index multipler_index{node.param().multipler_index}; + const ::internal::tflite::operand::Index activation_index{node.param().activation_index}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(); + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(); + const auto ker_shape = _ctx.at(ker_index).shape().asFeature(); + const auto bias_size = _ctx.at(bias_index).shape().asVector(); + + auto multiplier = _ctx.at(multipler_index).asScala(); + + // Multiplier in CLDepthwiseConvolutionLayer is supported after ARMCompute 18.05 + assert(multiplier == 1); + assert(ker_shape.C == bias_size); + assert(ker_shape.C == ifm_shape.C * multiplier); + + const PaddingCode padding_type = + static_cast<PaddingCode>(_ctx.at(padding_index).asScala()); + + Stride stride; + + stride.vertical = _ctx.at(vstride_index).asScala(); + stride.horizontal = _ctx.at(hstride_index).asScala(); + + assert((ANEURALNETWORKS_PADDING_SAME == padding_type) || + (ANEURALNETWORKS_PADDING_VALID == padding_type)); + + // TODO Should move to the place where the operand is handled, if it is possible. 
+ // Set Shape Constraints and TensorInfo + _builder.addShapeConstr(ofm_index, asTensorInfo(ofm_shape, _ctx.at(ofm_index).type())); + _builder.addShapeConstr(ifm_index, asTensorInfo(ifm_shape, _ctx.at(ifm_index).type())); + // NOTE DepthwiseConv2D kernel is of shape [1, KER_W, KER_H, IFM_C * MULTIPLIER] + _builder.addShapeConstr(ker_index, asTensorInfo(ker_shape, _ctx.at(ker_index).type())); + _builder.addShapeConstr(bias_index, asTensorInfo(bias_size, _ctx.at(bias_index).type())); + + // Construct operation parameters + struct Param + { + int ofm_index; + int ifm_index; + int ker_index; + int bias_index; + + Padding padding; + Stride stride; + + int multipler; + FuseCode activation; + }; + + Param param; + + param.ofm_index = ofm_index.asInt(); + param.ifm_index = ifm_index.asInt(); + param.ker_index = ker_index.asInt(); + param.bias_index = bias_index.asInt(); + + param.stride = stride; + param.padding = (padding_type == ANEURALNETWORKS_PADDING_SAME) + ? same_padding(ifm_shape, ofm_shape, stride, ker_shape.W, ker_shape.H) + : valid_padding(); + + param.multipler = multiplier; + param.activation = static_cast<FuseCode>(_ctx.at(activation_index).asScala()); + + VERBOSE(DepthwiseConv2D) << "OFM_C: " << ofm_shape.C << std::endl; + VERBOSE(DepthwiseConv2D) << "OFM_H: " << ofm_shape.H << std::endl; + VERBOSE(DepthwiseConv2D) << "OFM_W: " << ofm_shape.W << std::endl; + + VERBOSE(DepthwiseConv2D) << "IFM_C: " << ifm_shape.C << std::endl; + VERBOSE(DepthwiseConv2D) << "IFM_H: " << ifm_shape.H << std::endl; + VERBOSE(DepthwiseConv2D) << "IFM_W: " << ifm_shape.W << std::endl; + + VERBOSE(DepthwiseConv2D) << "KER_C: " << ker_shape.C << std::endl; + VERBOSE(DepthwiseConv2D) << "KER_H: " << ker_shape.H << std::endl; + VERBOSE(DepthwiseConv2D) << "KER_W: " << ker_shape.W << std::endl; + + VERBOSE(DepthwiseConv2D) << "STRIDE_H: " << param.stride.vertical << std::endl; + VERBOSE(DepthwiseConv2D) << "STRIDE_W: " << param.stride.horizontal << std::endl; + + VERBOSE(DepthwiseConv2D) << 
"ACTIVATION: " << param.activation << std::endl; + + VERBOSE(DepthwiseConv2D) << "PAD(T): " << param.padding.top << std::endl; + VERBOSE(DepthwiseConv2D) << "PAD(B): " << param.padding.bottom << std::endl; + VERBOSE(DepthwiseConv2D) << "PAD(L): " << param.padding.left << std::endl; + VERBOSE(DepthwiseConv2D) << "PAD(R): " << param.padding.right << std::endl; + + auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) { + auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index}); + auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index}); + auto ker_alloc = ctx.at(::internal::tflite::operand::Index{param.ker_index}); + auto bias_alloc = ctx.at(::internal::tflite::operand::Index{param.bias_index}); + + const auto conv_info = asPadStringInfo(param.padding, param.stride); + + auto fn = make_layer<::arm_compute::CLDepthwiseConvolutionLayer>(); + + fn->configure(ifm_alloc, ker_alloc, bias_alloc, ofm_alloc, conv_info); + + builder.append(std::move(fn)); + + ActivationBuilder{builder}.append(param.activation, ofm_alloc); + }; + + _builder.addStage(stage); +} + void Planner::visit(const ::internal::tflite::op::MaxPool2D::implicit::Node &node) { const ::internal::tflite::operand::Index ofm_index{node.param().ofm_index}; diff --git a/runtimes/pure_arm_compute/src/internal/op/DepthwiseConv2D.cc b/runtimes/pure_arm_compute/src/internal/op/DepthwiseConv2D.cc new file mode 100644 index 0000000..be80050 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/op/DepthwiseConv2D.cc @@ -0,0 +1,67 @@ +#include "internal/op/DepthwiseConv2D.h" +#include "internal/op/NodeVisitor.h" + +#include <cassert> + +namespace internal +{ +namespace tflite +{ +namespace op +{ +namespace DepthwiseConv2D +{ +namespace implicit +{ + +void Node::accept(NodeVisitor &&v) const { v.visit(*this); } + +} // namespace implicit +} // namespace DepthwiseConv2D +} // namespace op +} // namespace tflite +} // namespace internal + +namespace internal +{ 
+namespace tflite +{ +namespace op +{ +namespace DepthwiseConv2D +{ +namespace implicit +{ + +Param::Param(uint32_t inputCount, const uint32_t *inputs, uint32_t outputCount, + const uint32_t *outputs) +{ + assert(inputCount == 8 && outputCount == 1); + + ofm_index = outputs[0]; + + // Each input should be interpreted as follows: + // + // 0 -> IFM Tensor Index + // 1 -> Kernel Tensor Index + // 2 -> Bias Tensor Index + // 3 -> Padding Code (ANEURALNETWORKS_PADDING_SAME or ANEURALNETWORKS_PADDING_VALID) Index + // 4 -> Stride (width) Index + // 5 -> Stride (height) Index + // 6 -> Depthwise Multiplier + // 7 -> Activation Index + ifm_index = inputs[0]; + ker_index = inputs[1]; + bias_index = inputs[2]; + padding_index = inputs[3]; + hstride_index = inputs[4]; + vstride_index = inputs[5]; + multipler_index = inputs[6]; + activation_index = inputs[7]; +} + +} // namespace implicit +} // namespace DepthwiseConv2D +} // namespace op +} // namespace tflite +} // namespace internal diff --git a/runtimes/pure_arm_compute/src/internal/op/DepthwiseConv2D.h b/runtimes/pure_arm_compute/src/internal/op/DepthwiseConv2D.h new file mode 100644 index 0000000..c184b39 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/op/DepthwiseConv2D.h @@ -0,0 +1,65 @@ +#ifndef __INTERNAL_OP_DEPTHWISE_CONV_2D_H__ +#define __INTERNAL_OP_DEPTHWISE_CONV_2D_H__ + +#include "internal/op/Node.h" + +#include <cstdint> + +namespace internal +{ +namespace tflite +{ +namespace op +{ +namespace DepthwiseConv2D +{ +namespace implicit +{ + +struct Param +{ + int32_t ofm_index; + + int32_t ifm_index; + int32_t ker_index; + int32_t bias_index; + + int32_t hstride_index; + int32_t vstride_index; + + int32_t padding_index; + int32_t multipler_index; + int32_t activation_index; + + Param() = default; + Param(uint32_t inputCount, const uint32_t *inputs, uint32_t outputCount, const uint32_t *outputs); +}; + +class Node final : public op::Node +{ +public: + Node(const Param &param) : _param(param) + { + // DO NOTHING + } + 
+public: + virtual ~Node() = default; + +public: + const Param &param(void) const { return _param; } + +public: + void accept(NodeVisitor &&) const override; + +private: + const Param _param; +}; + +} // namespace implicit +} // namespace DepthwiseConv2D +} // namespace op +} // namespace tflite +} // namespace internal + +#endif // __INTERNAL_OP_DEPTHWISE_CONV_2D_H__ diff --git a/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h b/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h index 69d82a3..90a1f51 100644 --- a/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h +++ b/runtimes/pure_arm_compute/src/internal/op/NodeVisitor.h @@ -6,6 +6,7 @@ #include "internal/op/Mul.h" #include "internal/op/Div.h" #include "internal/op/Conv2D.h" +#include "internal/op/DepthwiseConv2D.h" #include "internal/op/MaxPool2D.h" #include "internal/op/AvgPool2D.h" #include "internal/op/Concat.h" @@ -35,6 +36,7 @@ struct NodeVisitor virtual void visit(const Mul::Node &) = 0; virtual void visit(const Div::Node &) = 0; virtual void visit(const Conv2D::implicit::Node &) = 0; + virtual void visit(const DepthwiseConv2D::implicit::Node &) = 0; virtual void visit(const MaxPool2D::implicit::Node &) = 0; virtual void visit(const AvgPool2D::implicit::Node &) = 0; virtual void visit(const Concat::Node &) = 0; diff --git a/runtimes/pure_arm_compute/src/model.cc b/runtimes/pure_arm_compute/src/model.cc index 9dafc33..c9f768e 100644 --- a/runtimes/pure_arm_compute/src/model.cc +++ b/runtimes/pure_arm_compute/src/model.cc @@ -160,6 +160,31 @@ int ANeuralNetworksModel_addOperation(ANeuralNetworksModel *model, break; } + case ANEURALNETWORKS_DEPTHWISE_CONV_2D: + { + // inputCount is either 8 or 10 according to NN API specification. 
+ // - Padding is implicit when inputCount is 8 + // - Padding is explicit when inputCount is 10 + assert(inputCount == 8 || inputCount == 10); + assert(outputCount == 1); + + if (inputCount == 8) + { + using internal::tflite::op::DepthwiseConv2D::implicit::Param; + using internal::tflite::op::DepthwiseConv2D::implicit::Node; + + // Add 'operations' + auto &operations = model->deref().operations(); + + operations.emplace_back<Node>(Param{inputCount, inputs, outputCount, outputs}); + } + else + { + throw std::runtime_error{"Explicit padding is not supported, yet"}; + } + + break; + } case ANEURALNETWORKS_MAX_POOL_2D: { // inputCount is either 7 or 9 according to NN API specification. -- 2.7.4