[nnc] Added Broadcasting (#2417)
author Andrei Shedko/AI Tools Lab/SRR/Engineer/Samsung Electronics <a.shedko@samsung.com>
Thu, 13 Dec 2018 17:36:52 +0000 (20:36 +0300)
committer Efimov Alexander/AI Tools Lab/./Samsung Electronics <a.efimov@samsung.com>
Thu, 13 Dec 2018 17:36:52 +0000 (20:36 +0300)
- Added broadcasting to the interpreter and shape inference.
- Fixed the interpreter implementation and placed `const` in the correct location.
- Added broadcasting to the soft backend.
- Added tests and C++ backend implementations for broadcast `+, -, *, /, max`.
- Added a generic implementation of elementwise broadcast for ease of
- Fixed code style; removed the shape inference file.

Signed-off-by: Andrei Shedko <a.shedko@samsung.com>
13 files changed:
contrib/nnc/core/CMakeLists.txt
contrib/nnc/core/modelIR/TensorVariant.cpp
contrib/nnc/core/modelIR/operations/ElementwiseOp.cpp [new file with mode: 0644]
contrib/nnc/include/core/modelIR/TensorVariant.h
contrib/nnc/include/core/modelIR/operations/ElementwiseOp.h
contrib/nnc/passes/interpreter/Interpreter.cpp
contrib/nnc/passes/interpreter/ops/ElementwiseOp.cpp [deleted file]
contrib/nnc/passes/soft_backend/SBSerializer.cpp
contrib/nnc/passes/soft_backend/code_snippets/cpp_common_funcs.def
contrib/nnc/passes/soft_backend/code_snippets/cpp_elementwise.def
contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def
contrib/nnc/unittests/core/ShapeInference.cpp
contrib/nnc/unittests/soft_backend/CPPOperations.cpp

diff --git a/contrib/nnc/core/CMakeLists.txt b/contrib/nnc/core/CMakeLists.txt
index f2713f8..50dc209 100644 (file)
@@ -2,6 +2,7 @@ set(SOURCES "modelIR/operations/ConcatOp.cpp"
             "modelIR/operations/Conv2DOp.cpp"
             "modelIR/operations/DeConv2DOp.cpp"
             "modelIR/operations/DepthwiseConv2DOp.cpp"
+            "modelIR/operations/ElementwiseOp.cpp"
             "modelIR/operations/FullyConnectedOp.cpp"
             "modelIR/operations/GatherOp.cpp"
             "modelIR/operations/GemmOp.cpp"
diff --git a/contrib/nnc/core/modelIR/TensorVariant.cpp b/contrib/nnc/core/modelIR/TensorVariant.cpp
index 35bb440..45f3696 100644 (file)
@@ -22,9 +22,9 @@ namespace mir
 {
 
 TensorVariant::TensorVariant(const Shape& shape, const std::shared_ptr<char>& data,
-                             DTYPE dtype, size_t element_size)
-    : _dtype(dtype), _data(data), _strides{0},  _rank(shape.rank()),
-      _shape(shape), _element_size(element_size)
+                             DTYPE dtype, size_t element_size) :
+  _dtype(dtype), _data(data), _strides{0}, _rank(shape.rank()),
+  _shape(shape), _elementSize(element_size)
 {
   int stride = 1;
   for (int d = _rank - 1; d >= 0; --d)
@@ -34,9 +34,31 @@ TensorVariant::TensorVariant(const Shape& shape, const std::shared_ptr<char>& da
   }
 }
 
+/**
+ * @brief Construct a TensorVariant that shares t_old's data but has stride 0
+ * on axes where t_old's dimension is 1; used for broadcasting
+ * @param t_old TensorVariant to use as base
+ * @param shape shape to broadcast to
+ */
+TensorVariant::TensorVariant(const TensorVariant& t_old,
+                             const Shape& shape)
+  : _dtype(t_old._dtype), _data(t_old._data), _strides{0}, _rank(shape.rank()),
+    _shape(shape), _elementSize(t_old._elementSize) {
+  int axis_old = t_old._rank - 1;
+  for (int d = _rank - 1; d >= 0; d--) {
+    if (t_old._shape.dim(axis_old) == 1)
+      _strides[d] = 0;
+    else
+      _strides[d] = t_old._strides[axis_old];
+    axis_old--;
+    if (axis_old == -1)
+      break;
+  }
+}
+
 char *TensorVariant::at(const Index &idx) const
 {
-  return _data.get() + getOffset(idx) * _element_size;
+  return _data.get() + getOffset(idx) * _elementSize;
 }
 
 size_t TensorVariant::getOffset(const Index &idx) const {
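
Note: the new constructor above implements broadcasting purely through strides. A self-contained sketch of the idea, with a hypothetical offset() helper standing in for TensorVariant::getOffset (illustrative only, not part of the patch):

#include <cstddef>
#include <iostream>
#include <vector>

// A stride of 0 pins an axis: every index along that axis maps to the same element.
std::size_t offset(const std::vector<int>& idx, const std::vector<int>& strides) {
  std::size_t off = 0;
  for (std::size_t d = 0; d < idx.size(); ++d)
    off += static_cast<std::size_t>(idx[d]) * strides[d];
  return off;
}

int main() {
  // Source data has shape {3, 1}; we read it as if it were {3, 4}.
  std::vector<float> data = {10.f, 20.f, 30.f};
  std::vector<int> strides = {1, 0};  // the axis of extent 1 gets stride 0
  for (int r = 0; r < 3; ++r) {
    for (int c = 0; c < 4; ++c)
      std::cout << data[offset({r, c}, strides)] << ' ';  // each row repeats its single value
    std::cout << '\n';
  }
}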
diff --git a/contrib/nnc/core/modelIR/operations/ElementwiseOp.cpp b/contrib/nnc/core/modelIR/operations/ElementwiseOp.cpp
new file mode 100644 (file)
index 0000000..28d0d3b
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "core/modelIR/operations/ElementwiseOp.h"
+
+namespace nnc {
+namespace mir {
+namespace ops {
+
+void ElementwiseOp::inferOutputShapes() {
+  int max_rank = getInputShape(0).rank();
+  size_t max_ind = 0;
+  for (size_t i = 0; i < getNumInputs(); i++) {
+    if (max_rank < getInputShape(i).rank()) {
+      max_rank = getInputShape(i).rank();
+      max_ind = i;
+    }
+  }
+  Shape max_shape = getInputShape(max_ind);
+  for (size_t i = 0; i < getNumInputs(); i++) {
+    const auto& current_shape = getInputShape(i);
+    _needsBroadcast = _needsBroadcast || max_shape != current_shape; // any mismatch requires broadcasting
+    const int rank = current_shape.rank();
+    for (int axis = 0; axis < rank; axis++) {
+      auto current_dim = current_shape.dim(rank - axis - 1);
+      // get max for all axes
+      if (max_shape.dim(max_rank - axis - 1) == 1 && current_dim != 1) {
+        max_shape.dim(max_rank - axis - 1) = current_dim;
+      } else {
+        assert((current_dim == 1 || current_dim == max_shape.dim(max_rank - axis - 1))
+               && "Incompatible shapes in broadcast!");
+      }
+    }
+  }
+  setOutputShape(0, max_shape);
+}
+
+} // namespace ops
+} // namespace mir
+} // namespace nnc
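
Note: inferOutputShapes above applies the usual right-aligned broadcast rule: axes are compared from the innermost outwards, the larger extent wins, and the smaller one must be 1. A minimal sketch of that rule for two shapes (plain std::vector stand-ins with hypothetical names; the real code handles any number of inputs and Shape objects):

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<int> broadcastShapes(std::vector<int> a, const std::vector<int>& b) {
  if (a.size() < b.size())
    a.insert(a.begin(), b.size() - a.size(), 1);   // pad missing leading dims with 1
  for (std::size_t i = 0; i < b.size(); ++i) {
    int& out = a[a.size() - 1 - i];
    int cur = b[b.size() - 1 - i];
    if (out == 1)
      out = cur;
    else
      assert(cur == 1 || cur == out);              // incompatible shapes otherwise
  }
  return a;
}

int main() {
  // Same shapes as the ElementwiseBC unit test further down.
  auto s = broadcastShapes({1, 10, 10, 1}, {1, 1, 10, 10});
  assert((s == std::vector<int>{1, 10, 10, 10}));
}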
diff --git a/contrib/nnc/include/core/modelIR/TensorVariant.h b/contrib/nnc/include/core/modelIR/TensorVariant.h
index ba574e4..07495d0 100644 (file)
@@ -37,6 +37,8 @@ public:
 
   explicit TensorVariant(const Shape& shape, const std::shared_ptr<char>& data, DTYPE dtype, size_t element_size);
 
+  explicit TensorVariant(const TensorVariant& t_old, const Shape& shape);
+
   template<typename T>
   explicit TensorVariant(const Shape& shape, const std::shared_ptr<T>& data, DTYPE dtype) :
     TensorVariant(
@@ -53,7 +55,8 @@ public:
 
   virtual const Shape &getShape() const { return _shape; }
   DTYPE getDataType() const { return _dtype; }
-  size_t getElementSize() const { return _element_size; }
+
+  size_t getElementSize() const { return _elementSize; }
 
  private:
   DTYPE _dtype;
@@ -62,7 +65,7 @@ public:
   size_t _rank;
   Shape _shape;
 
-  size_t _element_size;
+  size_t _elementSize;
 };
 
 } // namespace mir
diff --git a/contrib/nnc/include/core/modelIR/operations/ElementwiseOp.h b/contrib/nnc/include/core/modelIR/operations/ElementwiseOp.h
index 559cbee..0b61535 100644 (file)
@@ -39,14 +39,17 @@ public:
    * @param num_inputs Number of inputs
    */
   ElementwiseOp(const std::vector<IODescriptor>& args, OpType op_type)
-    : Operation(Type::elementwise, args), _opType(op_type) {
-    // Infer output shape.
-    // TODO Check that all inputs have the same shape.
-    setOutputShape(0, getInputShape(0));
+    : Operation(Type::elementwise, args), _opType(op_type), _needsBroadcast(false) {
+    inferOutputShapes();
   };
 
+  bool getBroadcast() const { return _needsBroadcast; }
+
 private:
+  void inferOutputShapes();
+
   OpType _opType;
+  bool _needsBroadcast;
 
 public:
   OpType getOpType() const { return _opType; }
diff --git a/contrib/nnc/passes/interpreter/Interpreter.cpp b/contrib/nnc/passes/interpreter/Interpreter.cpp
index 341f0a7..6811411 100644 (file)
@@ -237,8 +237,18 @@ void NNInterpreter::visit(ops::ElementwiseOp& op) {
   mapByName(&op);
   auto operands = op.getPrevNodes();
   std::vector<Tensor<float>> ins;
+  // Reserve space for tensor variants to avoid reference invalidation when pushing into vector
+  std::vector<TensorVariant> broadcasted{};
+  broadcasted.reserve(op.getNumInputs());
+
   for (auto &in : operands) {
-    ins.emplace_back(var(in.op->getId())[in.index]);
+    auto& tmp = var(in.op->getId())[in.index];
+    if (op.getBroadcast()) {
+      broadcasted.emplace_back(tmp, op.getOutputShape(0));
+      ins.emplace_back(broadcasted.back());
+    } else {
+      ins.emplace_back(tmp);
+    }
   }
   float (*func)(float,float); // Another dirty hack
   switch (op.getOpType()) {
@@ -257,13 +267,14 @@ void NNInterpreter::visit(ops::ElementwiseOp& op) {
     default:
       assert(false && "Unsupported Optype");
   }
-
-  var(op.getId()) = Fill<float>(op.getOutputShape(0), [&func, &ins, &op](const Index &id) {
-    float acc = ins[0].at(id);
-    for (size_t i = 1; i < ins.size() ; i++)
-      acc = func(acc, ins[i].at(id));
-    return acc;
-  })();
+  var(op.getId()) = Fill<float>(
+    op.getOutputShape(0),
+    [&func, &ins](const Index& id) {
+      float acc = ins[0].at(id);
+      for (size_t i = 1; i < ins.size(); i++)
+        acc = func(acc, ins[i].at(id));
+      return acc;
+    })();
 }
 
 void NNInterpreter::visit(ops::DeConv2DOp& op) {
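
Note: the reserve() call in the hunk above is what keeps the Tensor<float> wrappers in `ins` valid; each wrapper refers to a TensorVariant stored in `broadcasted`, and a reallocation of that vector would invalidate those references. A tiny standalone illustration of the pitfall (hypothetical types, not interpreter code):

#include <vector>

struct View { const int* p; };   // stand-in for Tensor<float>, which refers into a TensorVariant

int main() {
  std::vector<int> storage;      // stand-in for `broadcasted`
  storage.reserve(2);            // without this, the second push_back may reallocate...
  std::vector<View> views;       // stand-in for `ins`
  storage.push_back(1);
  views.push_back({&storage.back()});
  storage.push_back(2);          // ...and leave views[0].p dangling
  return *views[0].p;            // still 1 here, because capacity was reserved up front
}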
diff --git a/contrib/nnc/passes/interpreter/ops/ElementwiseOp.cpp b/contrib/nnc/passes/interpreter/ops/ElementwiseOp.cpp
deleted file mode 100644 (file)
index 768cfa1..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Elementwise.h"
-//do not delete
-//used to force compilation of elementwise.h
diff --git a/contrib/nnc/passes/soft_backend/SBSerializer.cpp b/contrib/nnc/passes/soft_backend/SBSerializer.cpp
index c4449a8..ee7f565 100644 (file)
@@ -298,7 +298,9 @@ void Serializer::visit(mir::ops::TanhOp& op) {
 void Serializer::visit(mir::ops::ElementwiseOp& op) {
   _curOp->_paramStartOffset = _buffer.size();
   // Op type is known at codegen Time
-  serializeT<int>((int32_t) op.getNumInputs());
+  serializeT<int>((int32_t)op.getNumInputs());
+  serializeT<int>((int32_t)op.getBroadcast());
+  serializeShape(op.getOutputShape(0));
 }
 
 void Serializer::visit(mir::ops::EluOp& op) {
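
Note: the params blob is a flat byte stream, so the generated ElementWise kernel (see cpp_operations.def below) has to read the fields in exactly the order they are written here: input count, broadcast flag, output shape. A minimal sketch of that pairing, with simplified put/get helpers standing in for serializeT/deserializeT (illustrative only):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

template <typename T>
void put(std::vector<char>& buf, T v) {
  const char* p = reinterpret_cast<const char*>(&v);
  buf.insert(buf.end(), p, p + sizeof(T));
}

template <typename T>
T get(const char*& p) {
  T v;
  std::memcpy(&v, p, sizeof(T));
  p += sizeof(T);
  return v;
}

int main() {
  std::vector<char> params;
  put<int32_t>(params, 2);               // number of inputs (written first)
  put<int32_t>(params, 1);               // broadcast flag
  // ...the output shape would follow, via serializeShape / deserializeShape

  const char* rd = params.data();
  assert(get<int32_t>(rd) == 2);         // the reader must consume in the same order
  assert(get<int32_t>(rd) == 1);
}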
diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_common_funcs.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_common_funcs.def
index a538249..1659122 100644 (file)
@@ -126,6 +126,14 @@ public:
     BuildFrom(init_list);
   }
 
+  // Widen this shape to the element-wise maximum of the two shapes (for elementwise ops)
+  void maxShape(RuntimeShape const& other) {
+    TFLITE_CHECK(other.DimensionsCount() == size_ && size_ == 4 && "Elementwise shapes must be 4d");
+    for (size_t i = 0; i < 4; i++) {
+      dims_[i] = std::max(dims_[i], other.dims_[i]);
+    }
+  }
+
   // Avoid using this constructor.  We should be able to delete it when C++17
   // rolls out.
   RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
@@ -341,6 +349,41 @@ int MatchingArraySize(const ArrayType1& array1, int index1,
   return ArraySize(array1, index1);
 }
 
+// Flat size calculation, checking that dimensions match with one or more other
+// arrays.
+inline int MatchingFlatSize(const RuntimeShape& shape,
+                            const RuntimeShape& check_shape_0) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+  }
+  return shape.FlatSize();
+}
+
+inline int MatchingFlatSize(const RuntimeShape& shape,
+                            const RuntimeShape& check_shape_0,
+                            const RuntimeShape& check_shape_1) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+  }
+  return MatchingFlatSize(shape, check_shape_1);
+}
+
+inline int MatchingFlatSize(const RuntimeShape& shape,
+                            const RuntimeShape& check_shape_0,
+                            const RuntimeShape& check_shape_1,
+                            const RuntimeShape& check_shape_2) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+  }
+  return MatchingFlatSize(shape, check_shape_1, check_shape_2);
+}
+
 template <int N>
 inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0) {
   for (int i = 0; i < N; ++i) {
@@ -439,6 +482,11 @@ VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) {
 }
 
 template <typename Scalar>
+VectorMap<Scalar> MapAsVector(Scalar* data, const size_t size) {
+  return VectorMap<Scalar>(data, size, 1);
+}
+
+template <typename Scalar>
 using MatrixMap = typename std::conditional<
     std::is_const<Scalar>::value,
     Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
@@ -585,3 +633,79 @@ struct TransposeParams {
   int8 perm_count;
   int32 perm[4];
 };
+
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N>
+struct NdArrayDesc {
+  // The "extent" of each dimension. Indices along dimension d must be in the
+  // half-open interval [0, extents[d]).
+  int extents[N];
+
+  // The number of *elements* (not bytes) between consecutive indices of each
+  // dimension.
+  int strides[N];
+};
+
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
+                            int i3) {
+  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
+  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
+  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
+  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
+  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
+         i3 * desc.strides[3];
+}
+
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(
+  const RuntimeShape& input0_shape, const RuntimeShape& input1_shape,
+  NdArrayDesc<N>* desc0_out, NdArrayDesc<N>* desc1_out) {
+  TFLITE_DCHECK(desc0_out != nullptr);
+  TFLITE_DCHECK(desc1_out != nullptr);
+
+  auto extended_input0_shape = RuntimeShape::ExtendedShape(N, input0_shape);
+  auto extended_input1_shape = RuntimeShape::ExtendedShape(N, input1_shape);
+
+  // Copy dims to desc, calculating strides.
+  int desc0_stride = 1;
+  int desc1_stride = 1;
+  for (int i = N - 1; i >= 0; --i) {
+    desc0_out->extents[i] = extended_input0_shape.Dims(i);
+    desc0_out->strides[i] = desc0_stride;
+    desc0_stride *= extended_input0_shape.Dims(i);
+    desc1_out->extents[i] = extended_input1_shape.Dims(i);
+    desc1_out->strides[i] = desc1_stride;
+    desc1_stride *= extended_input1_shape.Dims(i);
+  }
+
+  // Walk over each dimension. If the extents are equal do nothing.
+  // Otherwise, set the desc with extent 1 to have extent equal to the other and
+  // stride 0.
+  for (int i = 0; i < N; ++i) {
+    const int extent0 = extended_input0_shape.Dims(i);
+    const int extent1 = extended_input1_shape.Dims(i);
+    if (extent0 != extent1) {
+      if (extent0 == 1) {
+        desc0_out->strides[i] = 0;
+        desc0_out->extents[i] = extent1;
+      } else {
+        TFLITE_DCHECK_EQ(extent1, 1);
+        desc1_out->strides[i] = 0;
+        desc1_out->extents[i] = extent0;
+      }
+    }
+  }
+}
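
Note: NdArrayDescsForElementwiseBroadcast above encodes broadcasting the same way the TensorVariant constructor does: a broadcast axis keeps the widened extent but gets stride 0. A self-contained sketch of what it produces for two concrete 4-D shapes (plain int arrays instead of RuntimeShape; the real helper first extends both shapes to rank 4):

#include <cstdio>

struct Desc4 { int extents[4]; int strides[4]; };

void makeDescs(const int s0[4], const int s1[4], Desc4& d0, Desc4& d1) {
  int st0 = 1, st1 = 1;
  for (int i = 3; i >= 0; --i) {           // natural row-major strides
    d0.extents[i] = s0[i]; d0.strides[i] = st0; st0 *= s0[i];
    d1.extents[i] = s1[i]; d1.strides[i] = st1; st1 *= s1[i];
  }
  for (int i = 0; i < 4; ++i) {            // widen broadcast axes, zero their strides
    if (s0[i] != s1[i]) {
      if (s0[i] == 1) { d0.strides[i] = 0; d0.extents[i] = s1[i]; }
      else            { d1.strides[i] = 0; d1.extents[i] = s0[i]; }
    }
  }
}

int main() {
  int a[4] = {1, 3, 1, 2}, b[4] = {1, 1, 4, 2};
  Desc4 da, db;
  makeDescs(a, b, da, db);
  // da: extents {1,3,4,2}, strides {6,2,0,1}; db: extents {1,3,4,2}, strides {8,0,2,1}
  std::printf("%d %d\n", da.strides[2], db.strides[1]);   // prints: 0 0
}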
diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_elementwise.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_elementwise.def
index e72c900..765ab99 100644 (file)
@@ -13,12 +13,66 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
+// generalized and efficient BroadcastBinaryFunction.
+//
+// Also appears to duplicate MinimumMaximum.
+//
+// R: Result type. T1: Input 1 type. T2: Input 2 type.
+template <typename R, typename T1, typename T2>
+inline void BroadcastBinaryFunction4DSlow(
+  const RuntimeShape& unextended_input1_shape, const T1* input1_data,
+  const RuntimeShape& unextended_input2_shape, const T2* input2_data,
+  const RuntimeShape& unextended_output_shape, R* output_data,
+  R (* func)(T1, T2)) {
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape output_shape =
+    RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
+                                      unextended_input2_shape, &desc1, &desc2);
+
+  for (int b = 0; b < output_shape.Dims(0); ++b) {
+    for (int y = 0; y < output_shape.Dims(1); ++y) {
+      for (int x = 0; x < output_shape.Dims(2); ++x) {
+        for (int c = 0; c < output_shape.Dims(3); ++c) {
+          auto out_idx = Offset(output_shape, b, y, x, c);
+          auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+          auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+          auto in1_val = input1_data[in1_idx];
+          auto in2_val = input2_data[in2_idx];
+          output_data[out_idx] = func(in1_val, in2_val);
+        }
+      }
+    }
+  }
+}
+
+// R: Result type. T1: Input 1 type. T2: Input 2 type.
+// TODO(renjieliu): Refactor other binary functions to use this one.
+template <typename R, typename T1, typename T2>
+inline void BinaryFunction(const RuntimeShape& input1_shape,
+                           const T1* input1_data,
+                           const RuntimeShape& input2_shape,
+                           const T2* input2_data,
+                           const RuntimeShape& output_shape, R* output_data,
+                           R (* func)(T1, T2)) {
+  const int flat_size =
+    MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = func(input1_data[i], input2_data[i]);
+  }
+}
+
 struct Add {
   static inline void Add_(const float* input1_data, const float* input2_data,
-                  float* output_data, const int size) {
+                          float* output_data, const int size) {
     int i = 0;
 #ifdef USE_NEON
-
     for (; i <= size - 16; i += 16) {
       auto a10 = vld1q_f32(input1_data + i);
       auto a11 = vld1q_f32(input1_data + i + 4);
@@ -50,28 +104,62 @@ struct Add {
     }
   }
 
-  static inline void Call(const float* input1_data, const float* input2_data,
-                          float* output_data, Dims<4> dims) {
-    Add_(input1_data, input2_data, output_data, FlatSize(dims));
+  static inline void Call(
+    const float* input1_data, RuntimeShape in1_shape,
+    const float* input2_data, RuntimeShape in2_shape,
+    float* output_data, RuntimeShape out_shape,
+    bool needsBroadcast) {
+    if (needsBroadcast) {
+      BroadcastBinaryFunction4DSlow<float, float, float>(
+        in1_shape, input1_data,
+        in2_shape, input2_data,
+        out_shape, output_data,
+        [](float a, float b) { return a + b; }
+      );
+    } else {
+      Add_(input1_data, input2_data, output_data, out_shape.FlatSize());
+    }
   }
 };
 
 struct Max {
-  static inline void Call(const float* input1_data, const float* input2_data,
-                          float* output_data, Dims<4> dims) {
-    auto output = MapAsVector(output_data, dims);
-    output = output.cwiseMax( MapAsVector(input2_data, dims) );
+  static inline void Call(
+    const float* input1_data, RuntimeShape in1_shape,
+    const float* input2_data, RuntimeShape in2_shape,
+    float* output_data, RuntimeShape out_shape,
+    const bool needsBroadcast) {
+    if (needsBroadcast) {
+      BroadcastBinaryFunction4DSlow<float, float, float>(
+        in1_shape, input1_data,
+        in2_shape, input2_data,
+        out_shape, output_data,
+        [](float a, float b) { return std::max(a, b); }
+      );
+    } else {
+      auto output = MapAsVector(output_data, out_shape.FlatSize());
+      output = output.cwiseMax(MapAsVector(input2_data, out_shape.FlatSize()));
+    }
   }
 };
 
 struct Mul {
-  static inline void Call(const float* input1_data, const float* input2_data,
-                          float* output_data, Dims<4> dims) {
-    Mul_(input1_data, input2_data, output_data, FlatSize(dims));
+  static inline void Call(const float* input1_data, RuntimeShape in1_shape,
+                          const float* input2_data, RuntimeShape in2_shape,
+                          float* output_data, RuntimeShape out_shape,
+                          const bool needsBroadcast) {
+    if (needsBroadcast) {
+      BroadcastBinaryFunction4DSlow<float, float, float>(
+        in1_shape, input1_data,
+        in2_shape, input2_data,
+        out_shape, output_data,
+        [](float a, float b) { return a * b; });
+    } else {
+      Mul_(input1_data, input2_data, output_data, out_shape.FlatSize());
+    }
   }
 
   static inline void Mul_(const float* input1_data, const float* input2_data,
-                  float* output_data, const int size) {
+                          float* output_data, const int size) {
 
     int i = 0;
 #ifdef USE_NEON
@@ -112,9 +200,21 @@ struct Mul {
 //TODO maybe move to a separate file since everything else here is extracted from TF Lite
 //23.11.2018
 struct Div {
-  static inline void Call(const float* input1_data, const float* input2_data,
-                          float* output_data, Dims<4> dims) {
-    auto output = MapAsVector(output_data, dims);
-    output = output.cwiseQuotient(MapAsVector(input2_data, dims));
+  static inline void Call(
+    const float* input1_data, RuntimeShape in1_shape,
+    const float* input2_data, RuntimeShape in2_shape,
+    float* output_data, RuntimeShape out_shape,
+    bool needsBroadcast) {
+    if (needsBroadcast) {
+      BroadcastBinaryFunction4DSlow<float, float, float>(
+        in1_shape, input1_data,
+        in2_shape, input2_data,
+        out_shape, output_data,
+        [](float a, float b) { return a / b; }
+      );
+    } else {
+      auto output = MapAsVector(output_data, out_shape.FlatSize());
+      output = output.cwiseQuotient(MapAsVector(input2_data, out_shape.FlatSize()));
+    }
   }
-};
+};
\ No newline at end of file
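
Note on the Call() implementations above: BroadcastBinaryFunction4DSlow takes a plain function pointer R (*func)(T1, T2), which is why the lambdas passed to it are capture-free; a capturing lambda would not convert. A minimal standalone illustration of that constraint:

// A capture-free lambda decays to a function pointer, matching the
// `R (*func)(T1, T2)` parameter used by the broadcast helper above.
float (*add_fn)(float, float) = [](float a, float b) { return a + b; };

int main() {
  return static_cast<int>(add_fn(1.5f, 2.5f));   // 4
}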
diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def
index 8b7075e..c705d9f 100644 (file)
@@ -481,17 +481,36 @@ template <typename F, class ...Args>
 void ElementWise(Tensor &out,
   const char *params, const Args &...inputs) {
   const float *input[] = {inputs.getData()...};
+  RuntimeShape in_shapes[] = {shapeToRuntimeShape(inputs.getShape())...};
 
-  auto ins = std::tie(inputs...);
-
-  const Dims<4> out_d = shapeToDims(std::get<0>(ins).getShape());
   const int32_t num_inputs = deserializeT<int32_t>(params);
+  const bool needs_broadcast = (bool)deserializeT<int32_t>(params);
+  const Shape out_shape = deserializeShape(params);
 
-  out.reShape(std::get<0>(ins).getShape());
+  out.reShape(out_shape);
   out.fillData(input[0]);
-  for (int32_t i = 1; i < num_inputs; ++i) {
-    F::Call(out.getData(), input[i], out.getData(), out_d);
+  const auto out_rt = shapeToRuntimeShape(out_shape);
+  if (!needs_broadcast) {
+    for (int32_t i = 1; i < num_inputs; ++i) {
+      F::Call(out.getData(), out_rt,
+              input[i], in_shapes[i],
+              out.getData(), out_rt,
+              0);
+    }
+  } else {
+    auto running_shape = RuntimeShape::ExtendedShape(4, in_shapes[0]);
+    for (int32_t i = 1; i < num_inputs; ++i) {
+      float* inp_tmp = new float[running_shape.FlatSize()];
+      memcpy(inp_tmp, out.getData(), (size_t)running_shape.FlatSize() * 4);
+      F::Call(inp_tmp, running_shape,
+              input[i], in_shapes[i],
+              out.getData(), out_rt,
+              1);
+      delete[] inp_tmp;  // free the temporary copy of the running result
+      // This modifies the running shape
+      running_shape.maxShape(RuntimeShape::ExtendedShape(4, in_shapes[i]));
+    }
   }
+
 }
 
 // TODO refactor tflite's code for this op
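
Note: the ElementWise template above folds the inputs left to right: the output buffer is primed with the first input, each further input is combined into it, and in the broadcast branch the running shape is widened via maxShape() after every step. A 1-D analogue of that accumulation scheme (hypothetical names, length-1 broadcasting only, not generated code):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

// Inputs of length 1 broadcast against longer inputs; the result length is the max.
std::vector<float> foldElementwise(const std::vector<std::vector<float>>& inputs,
                                   const std::function<float(float, float)>& f) {
  std::vector<float> acc = inputs[0];                       // primed with the first input
  for (std::size_t i = 1; i < inputs.size(); ++i) {
    const auto& in = inputs[i];
    std::size_t n = std::max(acc.size(), in.size());
    std::vector<float> next(n);
    for (std::size_t j = 0; j < n; ++j)
      next[j] = f(acc[acc.size() == 1 ? 0 : j], in[in.size() == 1 ? 0 : j]);
    acc = std::move(next);                                  // the "running shape" grows
  }
  return acc;
}

int main() {
  auto r = foldElementwise({{1.f, 2.f, 3.f}, {10.f}},
                           [](float a, float b) { return a + b; });
  assert((r == std::vector<float>{11.f, 12.f, 13.f}));
}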
diff --git a/contrib/nnc/unittests/core/ShapeInference.cpp b/contrib/nnc/unittests/core/ShapeInference.cpp
index 98a67fb..ad9f10c 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "core/modelIR/Graph.h"
 #include "core/modelIR/operations/ReshapeOp.h"
+#include "core/modelIR/operations/ElementwiseOp.h"
 #include "core/modelIR/operations/ResizeOp.h"
 #include "core/modelIR/operations/SqueezeOp.h"
 #include "core/modelIR/operations/ReduceFOp.h"
@@ -118,6 +119,22 @@ TEST(ShapeInferenceTest, SqueezeTestAllDims) {
   ASSERT_EQ(sq1->getOutputShape(0), expected_shape);
 }
 
+TEST(ShapeInferenceTest, ElementwiseBC) {
+  Graph g;
+
+  Shape input_shape{1, 10, 10, 1};
+  Shape input2_shape{1, 1, 10, 10};
+  auto input = g.create<ops::VariableOp>("input1", input_shape);
+  auto input2 = g.create<ops::VariableOp>("input2", input2_shape);
+
+  auto add = g.create<ops::ElementwiseOp>("add_1",
+                                          std::vector<IODescriptor>{input->getOutput(0),
+                                                                    input2->getOutput(0)},
+                                          ops::ElementwiseOp::OpType::add);
+
+  ASSERT_EQ(add->getOutputShape(0), Shape({1, 10, 10, 10}));
+}
+
 TEST(ShapeInferenceTest, SqueezeTestSpecificDims) {
   Graph g;
 
diff --git a/contrib/nnc/unittests/soft_backend/CPPOperations.cpp b/contrib/nnc/unittests/soft_backend/CPPOperations.cpp
index ef1dbde..c35ef89 100644 (file)
@@ -389,6 +389,79 @@ TEST(cpp_operations_test, concat)
     }
 }
 
+TEST(cpp_operations_test, add2bc) {
+  for (int numDims = 2; numDims <= 4; ++numDims) {
+    // test prerequisites
+    vector<int> shape_data1{3, 44, 5, 1};
+    vector<int> shape_data2{3, 1, 5, 6};
+    shape_data1.resize(numDims);
+    shape_data2.resize(numDims);
+    vector<Tensor> input_a_tensors(2);
+    vector<unique_ptr<mir::TensorVariant>> input_n_tensors(2);
+    fillTensors(input_n_tensors[0], input_a_tensors[0], shape_data1, 1.0f);
+    fillTensors(input_n_tensors[1], input_a_tensors[1], shape_data2, 2.0f);
+    auto op_generator = [](mir::Graph& g, const std::vector<mir::IODescriptor>& inputs) {
+      return g.create<mir::ops::ElementwiseOp>("y", inputs, mir::ops::ElementwiseOp::OpType::add);
+    };
+
+    createAndRunTestGraph(op_generator, ElementWise<Add, Tensor, Tensor>, input_n_tensors,
+                          input_a_tensors[0],
+                          input_a_tensors[1]);
+  }
+}
+
+TEST(cpp_operations_test, mul3bc) {
+  for (int numDims = 2; numDims <= 4; ++numDims) {
+    // test prerequisites
+    vector<int> shape_data1{3, 22, 5, 1};
+    vector<int> shape_data2{3, 1, 5, 6};
+    vector<int> shape_data3{1, 22, 1, 6};
+    shape_data1.resize(numDims);
+    shape_data2.resize(numDims);
+    shape_data3.resize(numDims);
+    vector<Tensor> input_a_tensors(3);
+    vector<unique_ptr<mir::TensorVariant>> input_n_tensors(3);
+    fillTensors(input_n_tensors[0], input_a_tensors[0], shape_data1, 1.0f);
+    fillTensors(input_n_tensors[1], input_a_tensors[1], shape_data2, 2.0f);
+    fillTensors(input_n_tensors[2], input_a_tensors[2], shape_data3, 3.0f);
+    auto opGenerator = [](mir::Graph& g, const std::vector<mir::IODescriptor>& inputs) {
+      return g.create<mir::ops::ElementwiseOp>("y", inputs, mir::ops::ElementwiseOp::OpType::mul);
+    };
+
+    createAndRunTestGraph(opGenerator, ElementWise<Mul, Tensor, Tensor, Tensor>, input_n_tensors,
+                          input_a_tensors[0],
+                          input_a_tensors[1], input_a_tensors[2]);
+  }
+}
+
+TEST(cpp_operations_test, div3bc) {
+  for (int numDims = 2; numDims <= 4; ++numDims) {
+    // test prerequisites
+    vector<int> shape_data1{3, 22, 5, 1};
+    vector<int> shape_data2{3, 1, 5, 6};
+    vector<int> shape_data3{1, 22, 1, 6};
+    shape_data1.resize(numDims);
+    shape_data2.resize(numDims);
+    shape_data3.resize(numDims);
+    vector<Tensor> input_a_tensors(3);
+    vector<unique_ptr<mir::TensorVariant>> input_n_tensors(3);
+    fillTensors(input_n_tensors[0], input_a_tensors[0], shape_data1, 5.0f);
+    fillTensors(input_n_tensors[1], input_a_tensors[1], shape_data2, 2.0f);
+    fillTensors(input_n_tensors[2], input_a_tensors[2], shape_data3, 3.0f);
+    auto opGenerator = [](mir::Graph& g, const std::vector<mir::IODescriptor>& inputs) {
+      return g.create<mir::ops::ElementwiseOp>("y", inputs, mir::ops::ElementwiseOp::OpType::div);
+    };
+
+    createAndRunTestGraph(
+      opGenerator, ElementWise<Div, Tensor, Tensor, Tensor>,
+      input_n_tensors,
+      input_a_tensors[0],
+      input_a_tensors[1],
+      input_a_tensors[2]
+    );
+  }
+}
+
 
 TEST(cpp_operations_test, add2) {
   for (int numDims = 2; numDims <= 4; ++numDims) {
@@ -403,7 +476,8 @@ TEST(cpp_operations_test, add2) {
       return g.create<mir::ops::ElementwiseOp>("y", inputs, mir::ops::ElementwiseOp::OpType::add);
     };
 
-    createAndRunTestGraph(op_generator, ElementWise<Add,Tensor,Tensor>, input_n_tensors, input_a_tensors[0],
+    createAndRunTestGraph(op_generator, ElementWise<Add, Tensor, Tensor>, input_n_tensors,
+                          input_a_tensors[0],
                           input_a_tensors[1]);
   }
 }
@@ -422,7 +496,8 @@ TEST(cpp_operations_test, mul3) {
       return g.create<mir::ops::ElementwiseOp>("y", inputs, mir::ops::ElementwiseOp::OpType::mul);
     };
 
-    createAndRunTestGraph(opGenerator, ElementWise<Mul,Tensor,Tensor,Tensor>, input_n_tensors, input_a_tensors[0],
+    createAndRunTestGraph(opGenerator, ElementWise<Mul, Tensor, Tensor, Tensor>, input_n_tensors,
+                          input_a_tensors[0],
                           input_a_tensors[1], input_a_tensors[2]);
   }
 }
@@ -442,7 +517,8 @@ TEST(cpp_operations_test, max4) {
       return g.create<mir::ops::ElementwiseOp>("y", inputs, mir::ops::ElementwiseOp::OpType::max);
     };
 
-    createAndRunTestGraph(opGenerator, ElementWise<Max,Tensor,Tensor,Tensor,Tensor>, input_n_tensors, input_a_tensors[0],
+    createAndRunTestGraph(opGenerator, ElementWise<Max, Tensor, Tensor, Tensor, Tensor>,
+                          input_n_tensors, input_a_tensors[0],
                           input_a_tensors[1], input_a_tensors[2], input_a_tensors[3]);
   }
 }
@@ -490,8 +566,7 @@ TEST(cpp_operations_test, conv2d)
       for (iT inputC = 1; inputC <= 3; ++inputC)
         for (iT outputC = 1; outputC <= 3; ++outputC)
           for (iT strideH = 1; strideH <= 3; ++strideH)
-            for (iT strideW = 1; strideW <= 3; ++strideW)
-            {
+            for (iT strideW = 1; strideW <= 3; ++strideW) {
               vector<int> inputShapeData{1, 5, 7, static_cast<int>(inputC)};  // NHWC
               mir::Shape kernelShape{kernelH, kernelW, inputC, outputC}; // HWCN
               mir::Shape strides{strideH, strideW};
@@ -523,8 +598,7 @@ TEST(cpp_operations_tests, depthwise_conv)
       for (iT channels = 1; channels <= 3; ++channels)
         for (iT strideW = 1; strideW <= 3; ++strideW)
           for (iT strideH = 1; strideH <= 3; ++strideH)
-            for (iT multiplier = 1; multiplier <= 2; ++multiplier)
-            {
+            for (iT multiplier = 1; multiplier <= 2; ++multiplier) {
               vector<int> inputShapeData{1, 5, 7, static_cast<int>(channels)};  // NHWC
               mir::Shape kernelShape{kernelH, kernelW, channels, multiplier}; // HWCN
               mir::Shape strides{strideH, strideW};
@@ -570,8 +644,7 @@ static void genericPoolTest(Func testFunc, const vector<irOps::PoolOp::BorderTyp
     for (iT windowW = 1; windowW <= 3; ++windowW)
       for (iT channels = 1; channels <= 2; ++channels)
         for (iT strideH = 1; strideH <= 3; ++strideH)
-          for (iT strideW = 1; strideW <= 3; ++strideW)
-          {
+          for (iT strideW = 1; strideW <= 3; ++strideW) {
             vector<int> shapeData{1, 5, 7, static_cast<int>(channels)};
             mir::Shape windowShape{windowH, windowW};
             mir::Shape strides{strideH, strideW};
@@ -597,24 +670,24 @@ static void genericPoolTest(Func testFunc, const vector<irOps::PoolOp::BorderTyp
 TEST(cpp_operations_test, maxpool)
 {
   vector<irOps::PoolOp::BorderType> borderTypes{
-                                                irOps::PoolOp::BorderType::EMPTY
-                                               };
+    irOps::PoolOp::BorderType::EMPTY
+  };
   genericPoolTest<mir::ops::PoolOp::PoolingType::MAX>(maxPool, borderTypes);
 }
 
 TEST(cpp_operations_test, avgpool)
 {
   vector<irOps::PoolOp::BorderType> borderTypes{
-                                                irOps::PoolOp::BorderType::EMPTY,
-                                                irOps::PoolOp::BorderType::ZEROFILLED
-                                               };
+    irOps::PoolOp::BorderType::EMPTY,
+    irOps::PoolOp::BorderType::ZEROFILLED
+  };
   genericPoolTest<mir::ops::PoolOp::PoolingType::AVG>(avgPool, borderTypes);
 }
 
 TEST(cpp_operations_test, relu)
 {
   // test prerequisites
-  vector<int> shapeData{2,3,4,5};
+  vector<int> shapeData{2, 3, 4, 5};
   Tensor aInputTensor;
   vector<unique_ptr<mir::TensorVariant>> inputNTensors(1);
   fillTensors(inputNTensors[0], aInputTensor, shapeData, 1.0f);
@@ -627,7 +700,7 @@ TEST(cpp_operations_test, relu)
 
 TEST(cpp_operations_test, elu) {
   // test prerequisites
-  vector<int> shape_data{2,3,4,5};
+  vector<int> shape_data{2, 3, 4, 5};
   Tensor a_input_tensor;
   vector<unique_ptr<mir::TensorVariant>> input_n_tensors(1);
   fillTensors(input_n_tensors[0], a_input_tensor, shape_data, 1.0f);
@@ -640,7 +713,7 @@ TEST(cpp_operations_test, elu) {
 
 TEST(cpp_operations_test, tanh) {
   // test prerequisites
-  vector<int> shape_data{2,3,4,5};
+  vector<int> shape_data{2, 3, 4, 5};
   Tensor a_input_tensor;
   vector<unique_ptr<mir::TensorVariant>> input_n_tensors(1);
   fillTensors(input_n_tensors[0], a_input_tensor, shape_data, 1.0f);
@@ -686,8 +759,7 @@ TEST(cpp_operations_test, reduceMeanTst) {
 TEST(cpp_operations_test, softmax)
 {
   // iterate over number of dimensions in tensor
-  for (int numDims = 1; numDims <= 4; ++numDims)
-  {
+  for (int numDims = 1; numDims <= 4; ++numDims) {
     // test prerequisites
     vector<int> shapeData{2, 3, 4, 5};
     shapeData.resize(numDims);
@@ -706,8 +778,8 @@ TEST(cpp_operations_test, softmax)
 TEST(cpp_operations_test, reshape)
 {
   // test prerequisites
-  vector<int> inputShapeData{2,3,4,5};
-  vector<int> outputShapeData{1,120};
+  vector<int> inputShapeData{2, 3, 4, 5};
+  vector<int> outputShapeData{1, 120};
   mir::Shape nOutputShape;
   fillNShape(nOutputShape, outputShapeData);
   Tensor aInputTensor;