From 4593a7296b99ff7ca6ece3a22828aff0bc7800b3 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov/AI Tools Lab/SRR/Engineer/Samsung Electronics
Date: Tue, 22 Jan 2019 16:02:14 +0300
Subject: [PATCH] [nnc] Support for non-1 batch in Conv2D and DepthwiseConv2D
 operations (#2902)

* Add support for non-1 batch size in Conv2D and DepthwiseConv2D in the
  interpreter backend.
* Fix a bug in the ShapeRange class that caused undefined behavior when it
  was constructed from a temporary.

Signed-off-by: Sergei Barannikov
---
 contrib/nnc/include/core/modelIR/ShapeRange.h      |   7 +-
 contrib/nnc/passes/interpreter/ops/Conv2D.cpp      | 110 ++++++++-------
 .../nnc/passes/interpreter/ops/DepthwiseConv2D.cpp |  96 ++++++++----
 .../nnc/unittests/soft_backend/CPPOperations.cpp   |   6 +-
 4 files changed, 88 insertions(+), 131 deletions(-)

diff --git a/contrib/nnc/include/core/modelIR/ShapeRange.h b/contrib/nnc/include/core/modelIR/ShapeRange.h
index fb7f05e..12bf83b 100644
--- a/contrib/nnc/include/core/modelIR/ShapeRange.h
+++ b/contrib/nnc/include/core/modelIR/ShapeRange.h
@@ -75,8 +75,9 @@ class ShapeIter :
 class ShapeRange {
 public:
-  explicit ShapeRange(const Shape &shape) : _shape(const_cast<Shape &>(shape))
-  {}
+  explicit ShapeRange(const Shape& shape) : _shape(shape) {}
+
+  explicit ShapeRange(Shape&& shape) : _shape(std::move(shape)) {}
 
   ShapeIter begin() {
     return ShapeIter(_shape, 0);
@@ -96,7 +97,7 @@ class ShapeRange {
   }
 
 private:
-  Shape& _shape;
+  Shape _shape;
 };
 
 } // namespace mir

diff --git a/contrib/nnc/passes/interpreter/ops/Conv2D.cpp b/contrib/nnc/passes/interpreter/ops/Conv2D.cpp
index 4ee14a2..176587c 100644
--- a/contrib/nnc/passes/interpreter/ops/Conv2D.cpp
+++ b/contrib/nnc/passes/interpreter/ops/Conv2D.cpp
@@ -15,80 +15,57 @@
  */
 
 #include "Conv2D.h"
-#include "common.h"
 #include "core/modelIR/ShapeRange.h"
-#include
 
-namespace nnc
-{
+namespace nnc {
 
 using namespace mir;
-using namespace mir::ops;
-
-Index reduce(const Index &idx)
-{
-  Index res = idx;
-  res.resize(idx.rank() - 1);
-  return res;
-}
 
 // Mostly compatible with tensorflow implementation
 // Assuming input is in NHWC format with batch omitted( [in_height, in_width, in_channels] )
 // Kernel is in [filter_height, filter_width, in_channels, out_channels]
 // Refer to https://www.tensorflow.org/api_docs/python/tf/nn/conv2d for info
-std::vector<TensorVariant> Conv2D::operator()()
-{
-  auto res = allocate_tensor(_op.getOutputShape(0));
-  Tensor<float> resAccesor(res);
-  Shape strides{_op.getStrides().dim(0), _op.getStrides().dim(1), 1};
-  Index pads{_op.getPaddingBefore().at(0), _op.getPaddingBefore().at(1), 0};
-
-  Shape outShape = resAccesor.getShape();
-  // Assume batch size == 1 and strip it off.
-  assert(outShape.dim(0) == 1);
-  outShape = {outShape.dim(1), outShape.dim(2), outShape.dim(3)};
-
-  outShape.dim(2) = 1;
-  ShapeRange outRange(outShape);
-
-  Shape inShape = _input.getShape();
-  // Assume batch size == 1 and strip it off.
-  assert(inShape.dim(0) == 1);
-  inShape = {inShape.dim(1), inShape.dim(2), inShape.dim(3)};
-
-  ShapeRange inRange(inShape);
+std::vector<TensorVariant> Conv2D::operator()() {
+  const Shape& in_shape = _op.getInputShape(0);
+  const Shape& kernel_shape = _op.getInputShape(1);
+  const Shape& out_shape = _op.getOutputShape(0);
+  const Shape& strides = _op.getStrides();
+  const std::vector<int32_t>& pads = _op.getPaddingBefore();
+
+  assert(in_shape.rank() == 4);
+  assert(kernel_shape.rank() == 4);
+  assert(kernel_shape.dim(2) == in_shape.dim(3));
+  assert(kernel_shape.dim(3) == out_shape.dim(3));
+  assert(strides.rank() == 2);
+  assert(pads.size() == 2);
+
+  int32_t num_kernels = kernel_shape.dim(3);
 
-  Shape kShape = _kernel.getShape();
-  int32_t numKernels = kShape.dim(3);
-  kShape.dim(3) = 1;
-  ShapeRange kernelRange(kShape);
-
-  Index inputIdx;
-  inputIdx.resize(inShape.rank());
-
-  for (auto &outIdx : outRange)
-  {
-    // Take into account stripped off batch dimension.
-    Index tmp_out_index{0, outIdx.at(0), outIdx.at(1), outIdx.at(2)};
-
-    for (auto& kernelIdx : kernelRange)
-    {
-      translate(inputIdx, outIdx, kernelIdx, strides, pads);
-      if (inRange.contains(inputIdx))
-      {
-        auto kernelRegion = _kernel.getRegion(kernelIdx);
-        assert( kernelRegion.size() == numKernels );
-
-        auto outRegion = resAccesor.getRegion(tmp_out_index);
-        assert( outRegion.size() == numKernels );
-
-        // Take into account stripped off batch dimension.
-        Index tmp_in_index{0, inputIdx.at(0), inputIdx.at(1), inputIdx.at(2)};
-        auto in = _input.at(tmp_in_index);
-
-        for (int32_t kernelIndex = 0; kernelIndex < numKernels; ++kernelIndex)
-        {
-          outRegion.base()[kernelIndex] += in * kernelRegion.base()[kernelIndex];
+  auto res = allocate_tensor(_op.getOutputShape(0));
+  Tensor<float> res_accessor(res);
+
+  ShapeRange in_range(in_shape);
+  ShapeRange out_range(Shape{out_shape.dim(0), out_shape.dim(1), out_shape.dim(2), 1});
+  ShapeRange kernel_range(Shape{kernel_shape.dim(0), kernel_shape.dim(1), kernel_shape.dim(2), 1});
+
+  Index in_index;
+  in_index.resize(4);
+
+  for (const auto& out_index : out_range) {
+    auto out_region = res_accessor.getRegion(out_index);
+    assert(out_region.size() == num_kernels);
+    for (const auto& kernel_index : kernel_range) {
+      in_index.at(0) = out_index.at(0);
+      for (int i = 0; i < 2; ++i)
+        in_index.at(1 + i) = out_index.at(1 + i) * strides.dim(i) + kernel_index.at(i) - pads[i];
+      in_index.at(3) = kernel_index.at(2);
+
+      if (in_range.contains(in_index)) {
+        auto kernel_region = _kernel.getRegion(kernel_index);
+        assert(kernel_region.size() == num_kernels);
+        float in_val = _input.at(in_index);
+        for (int32_t kernel_i = 0; kernel_i < num_kernels; ++kernel_i) {
+          out_region.base()[kernel_i] += in_val * kernel_region.base()[kernel_i];
         }
       }
     }
@@ -99,11 +76,8 @@ std::vector<TensorVariant> Conv2D::operator()()
 
 Conv2D::Conv2D(const TensorVariant& input,
                const TensorVariant& kernel,
-               const Conv2DOp& op)
+               const ops::Conv2DOp& op)
     : _input(input), _kernel(kernel), _op(op) {
-  assert(_op.getInputShape(0).rank() == 4);
-  assert(_input.getShape().rank() == 4);
-  assert(_kernel.getShape().rank() == 4);
 }
 
 } // namespace nnc

diff --git a/contrib/nnc/passes/interpreter/ops/DepthwiseConv2D.cpp b/contrib/nnc/passes/interpreter/ops/DepthwiseConv2D.cpp
index dfb4308..8582db2 100644
--- a/contrib/nnc/passes/interpreter/ops/DepthwiseConv2D.cpp
+++ b/contrib/nnc/passes/interpreter/ops/DepthwiseConv2D.cpp
@@ -15,63 +15,49 @@
  */
 
 #include "DepthwiseConv2D.h"
-#include "common.h"
 #include "core/modelIR/ShapeRange.h"
 
-namespace nnc
-{
+namespace nnc {
 
 using namespace mir;
-using namespace mir::ops;
 
-std::vector<TensorVariant> DepthwiseConv2D::operator()()
-{
-  TensorVariant res = allocate_tensor(_op.getOutputShape(0));
-  Tensor<float> resAccessor(res);
-
-  Shape strides({_op.getStrides().dim(0), _op.getStrides().dim(1), 1});
-  Index pads({_op.getPaddingBefore().at(0), _op.getPaddingBefore().at(1), 0});
-
-  Shape outShape = res.getShape();
-  // Assume batch size == 1 and strip it off.
-  assert(outShape.dim(0) == 1);
-  outShape = {outShape.dim(1), outShape.dim(2), outShape.dim(3)};
-
-  outShape.dim(2) = 1;
-  ShapeRange outRange(outShape);
-
-  Shape inShape = _input.getShape();
-  // Assume batch size == 1 and strip it off.
-  assert(inShape.dim(0) == 1);
-  inShape = {inShape.dim(1), inShape.dim(2), inShape.dim(3)};
-
-  ShapeRange inRange(inShape);
-
-  Index inIdx;
-  inIdx.resize(outShape.rank());
-
-  auto kernelShape = _kernel.getShape();
-  int32_t channelMultiplierDim = kernelShape.rank() - 1;
-  int channelMultiplier = kernelShape.dim(channelMultiplierDim);
-
-  for (auto &outIdx : outRange)
-  {
-    // Take into account stripped off batch dimension.
-    Index tmp_out_index{0, outIdx.at(0), outIdx.at(1), outIdx.at(2)};
-
-    for (auto &kIdx : ShapeRange(kernelShape))
-    {
-      translate(inIdx, outIdx, kIdx, strides, pads);
-
-      if (inRange.contains(inIdx))
-      {
-        // Take into account stripped off batch dimension.
-        Index tmp_in_index{0, inIdx.at(0), inIdx.at(1), inIdx.at(2)};
-        auto in = _input.at(tmp_in_index);
-        auto b = _kernel.at(kIdx);
-        Index outIdxK = tmp_out_index;
-        outIdxK.at(3) = kIdx.at(2) * channelMultiplier + kIdx.at(channelMultiplierDim);
-        resAccessor.at(outIdxK) += in * b;
+std::vector<TensorVariant> DepthwiseConv2D::operator()() {
+  const Shape& in_shape = _op.getInputShape(0);
+  const Shape& kernel_shape = _op.getInputShape(1);
+  const Shape& out_shape = _op.getOutputShape(0);
+  const Shape& strides = _op.getStrides();
+  const std::vector<int32_t>& pads = _op.getPaddingBefore();
+
+  assert(in_shape.rank() == 4);
+  assert(kernel_shape.rank() == 4);
+  assert(kernel_shape.dim(2) == in_shape.dim(3));
+  assert(in_shape.dim(3) * kernel_shape.dim(3) == out_shape.dim(3));
+  assert(strides.rank() == 2);
+  assert(pads.size() == 2);
+
+  int32_t channel_multiplier = kernel_shape.dim(3);
+
+  TensorVariant res = allocate_tensor(out_shape);
+  Tensor<float> res_accessor(res);
+
+  ShapeRange in_range(in_shape);
+  ShapeRange kernel_range(kernel_shape);
+  ShapeRange out_range(Shape{out_shape.dim(0), out_shape.dim(1), out_shape.dim(2), 1});
+
+  Index in_index;
+  in_index.resize(4);
+
+  for (const auto& out_index : out_range) {
+    Index out_index_k = out_index;
+    for (const auto& kernel_index : kernel_range) {
+      in_index.at(0) = out_index.at(0);
+      for (int i = 0; i < 2; ++i)
+        in_index.at(1 + i) = out_index.at(1 + i) * strides.dim(i) + kernel_index.at(i) - pads[i];
+      in_index.at(3) = kernel_index.at(2);
+
+      if (in_range.contains(in_index)) {
+        out_index_k.at(3) = kernel_index.at(2) * channel_multiplier + kernel_index.at(3);
+        res_accessor.at(out_index_k) += _input.at(in_index) * _kernel.at(kernel_index);
       }
     }
   }
@@ -81,12 +67,8 @@ std::vector<TensorVariant> DepthwiseConv2D::operator()()
 
 DepthwiseConv2D::DepthwiseConv2D(const TensorVariant& input,
                                  const TensorVariant& kernel,
-                                 const DepthwiseConv2DOp& op)
+                                 const ops::DepthwiseConv2DOp& op)
     : _input(input), _kernel(kernel), _op(op) {
-  assert(_op.getInputShape(0).rank() == 4);
-  assert(_input.getShape().rank() == 4);
-  assert(_kernel.getShape().rank() == 4);
-  assert(_kernel.getShape().dim(2) == _input.getShape().dim(3));
 }
 
 } // namespace nnc

diff --git a/contrib/nnc/unittests/soft_backend/CPPOperations.cpp b/contrib/nnc/unittests/soft_backend/CPPOperations.cpp
index 92f5f27..4c6fcb8 100644
--- a/contrib/nnc/unittests/soft_backend/CPPOperations.cpp
+++ b/contrib/nnc/unittests/soft_backend/CPPOperations.cpp
@@ -625,7 +625,7 @@ TEST(cpp_operations_test, conv2d) {
       for (iT output_c = 1; output_c <= 3; ++output_c)
         for (iT stride_h = 1; stride_h <= 3; ++stride_h)
           for (iT stride_w = 1; stride_w <= 3; ++stride_w) {
-            vector<int> input_shape_data{1, 5, 7, static_cast<int>(input_c)};  // NHWC
+            vector<int> input_shape_data{3, 5, 7, static_cast<int>(input_c)};  // NHWC
             vector<int> kernel_shape_data{kernel_h, kernel_w, input_c, output_c};  // HWCN
             mir::Shape strides{stride_h, stride_w};
             vector<unique_ptr<mir::TensorVariant>> input_ntensors(2);
@@ -658,7 +658,7 @@ TEST(cpp_operations_test, depthwise_conv) {
          for (iT stride_w = 1; stride_w <= 3; ++stride_w)
            for (iT stride_h = 1; stride_h <= 3; ++stride_h)
              for (iT multiplier = 1; multiplier <= 2; ++multiplier) {
-               vector<int> input_shape_data{1, 7, 6, static_cast<int>(channels)};  // NHWC
+               vector<int> input_shape_data{3, 7, 6, static_cast<int>(channels)};  // NHWC
               vector<int> kernel_shape_data{kernel_h, kernel_w, channels, multiplier};  // HWCN
               mir::Shape strides{stride_h, stride_w};
               vector<unique_ptr<mir::TensorVariant>> input_ntensors(2);
@@ -764,7 +764,7 @@ static void genericPoolTest(Func test_func, const vector
-  vector<int> shape_data{1, 5, 7, static_cast<int>(channels)};
+  vector<int> shape_data{3, 5, 7, static_cast<int>(channels)};
   mir::Shape window_shape{windowH, windowW};
   mir::Shape strides{stride_h, stride_w};
   Tensor input_atensor;
-- 
2.7.4
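
Why the ShapeRange change matters: the old constructor cast away constness and
stored a Shape&, so a range constructed from a temporary kept a reference to an
object that had already been destroyed. The new interpreter code relies on
exactly that pattern (ShapeRange out_range(Shape{...})), which is why the class
now stores the Shape by value and adds an rvalue overload. Below is a
standalone miniature of the bug and of the fix; BadRange and GoodRange are
illustrative stand-ins written for this note, not the real mir::ShapeRange.

  #include <cstddef>
  #include <iostream>
  #include <utility>
  #include <vector>

  // Minimal stand-in for mir::Shape.
  struct Shape {
    std::vector<int> dims;
  };

  // Old pattern: keeps a reference. When the constructor argument is a
  // temporary, the reference dangles as soon as the full expression ends.
  class BadRange {
  public:
    explicit BadRange(const Shape& shape) : _shape(const_cast<Shape&>(shape)) {}
    std::size_t rank() const { return _shape.dims.size(); }  // UB if dangling
  private:
    Shape& _shape;
  };

  // Fixed pattern: owns the Shape. The Shape&& overload moves temporaries in
  // instead of referencing them; lvalues are copied by the const& overload.
  class GoodRange {
  public:
    explicit GoodRange(const Shape& shape) : _shape(shape) {}
    explicit GoodRange(Shape&& shape) : _shape(std::move(shape)) {}
    std::size_t rank() const { return _shape.dims.size(); }
  private:
    Shape _shape;
  };

  int main() {
    GoodRange good(Shape{{5, 7, 3, 1}});  // temporary moved into the range
    std::cout << good.rank() << '\n';     // prints 4; well-defined

    BadRange bad(Shape{{5, 7, 3, 1}});    // temporary dies at the ';'
    // bad.rank() would now read through a dangling reference: undefined
    // behavior, and exactly what ShapeRange did before this patch.
    (void)bad;
    return 0;
  }

Without the Shape&& overload, switching _shape to a value would still be
correct but would copy every temporary argument; the move overload keeps the
common ShapeRange(Shape{...}) case cheap.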