From 6f71ec74d3e1b36aa15467100cab2a29f5d355d5 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Vladimir=20Plazun/AI=20Tools=20Lab=20/SRR/Engineer/?=
 =?utf8?q?=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= <v.plazun@samsung.com>
Date: Mon, 28 Jan 2019 18:17:34 +0300
Subject: [PATCH] [nnc] Increase interpreter speed (#2768)

Remove std::vector usage from Shapes and Indices
Move all getters into headers (to allow inlining)
Increase inference speed by up to 4x-6x
Tinker with Deconv2D implementation to speed it up
Fix issue in broadcasting implementation

Signed-off-by: Vladimir Plazun <v.plazun@partner.samsung.com>
---
 contrib/nnc/core/modelIR/Index.cpp               |  20 +---
 contrib/nnc/core/modelIR/Shape.cpp               |  12 --
 contrib/nnc/core/modelIR/TensorVariant.cpp       |  29 +----
 contrib/nnc/include/ADT/SmallVector.h            | 142 +++++++++++++++++++++++
 contrib/nnc/include/core/modelIR/Common.h        |  31 +++++
 contrib/nnc/include/core/modelIR/Index.h         |  62 +++++++---
 contrib/nnc/include/core/modelIR/Shape.h         |  35 ++++--
 contrib/nnc/include/core/modelIR/ShapeRange.h    |   6 +-
 contrib/nnc/include/core/modelIR/TensorVariant.h |  28 +++--
 contrib/nnc/passes/interpreter/ops/DeConv2D.cpp  |  43 ++++---
 10 files changed, 302 insertions(+), 106 deletions(-)
 create mode 100644 contrib/nnc/include/ADT/SmallVector.h
 create mode 100644 contrib/nnc/include/core/modelIR/Common.h
diff --git a/contrib/nnc/core/modelIR/Index.cpp b/contrib/nnc/core/modelIR/Index.cpp
index ebeb182..7f4c30f 100644
--- a/contrib/nnc/core/modelIR/Index.cpp
+++ b/contrib/nnc/core/modelIR/Index.cpp
@@ -17,37 +17,27 @@
 #include "core/modelIR/Index.h"
 
 #include <algorithm>
+#include <cassert>
 
 namespace nnc {
 namespace mir {
 
-Index::Index(std::initializer_list<int32_t>&& l) : _indices{l} {
-  // DO NOTHING
-}
-
-int32_t Index::rank(void) const { return _indices.size(); }
-
 Index& Index::resize(int32_t size) {
   _indices.resize(size);
   return *this;
 }
 
 Index& Index::fill(int32_t index) {
-  std::fill(_indices.begin(), _indices.end(), index);
+  std::fill(std::begin(_indices), std::end(_indices), index);
   return (*this);
 }
 
-int32_t& Index::at(int32_t axis) { return _indices[(axis < 0) ? (_indices.size() + axis) : axis]; }
-int32_t Index::at(int32_t axis) const {
-  return _indices[(axis < 0) ? (_indices.size() + axis) : axis];
-}
-
-std::ostream& operator<<(std::ostream& s, const Index& sh) {
+std::ostream& operator<<(std::ostream& s, const Index& idx) {
   s << "[ ";
-  for (int32_t i = 0; i < sh.rank(); ++i) {
+  for (int32_t i = 0; i < idx.rank(); ++i) {
     if (i != 0)
       s << ", ";
-    s << sh.at(i);
+    s << idx.at(i);
   }
   s << "]";
 
diff --git a/contrib/nnc/core/modelIR/Shape.cpp b/contrib/nnc/core/modelIR/Shape.cpp
index 99765ff..af2a154 100644
--- a/contrib/nnc/core/modelIR/Shape.cpp
+++ b/contrib/nnc/core/modelIR/Shape.cpp
@@ -28,18 +28,6 @@ void Shape::resize(int32_t size) {
   _dims.resize(size);
 }
 
-int32_t Shape::dim(int32_t axis) const {
-  auto dim = (axis < 0) ? (_dims.size() + axis) : axis;
-  assert(dim < _dims.size());
-  return _dims.at(dim);
-}
-
-int32_t& Shape::dim(int32_t axis) {
-  auto dim = (axis < 0) ? (_dims.size() + axis) : axis;
-  assert(dim < _dims.size());
-  return _dims.at(dim);
-}
-
 int32_t Shape::numElements() const {
   if (rank() == 0) {
     return 0;
diff --git a/contrib/nnc/core/modelIR/TensorVariant.cpp b/contrib/nnc/core/modelIR/TensorVariant.cpp
index f0dbb53..b2722aa 100644
--- a/contrib/nnc/core/modelIR/TensorVariant.cpp
+++ b/contrib/nnc/core/modelIR/TensorVariant.cpp
@@ -23,7 +23,7 @@ namespace mir
 {
 
 TensorVariant::TensorVariant(DTYPE dtype, const Shape& shape)
-    : _dtype(dtype), _strides{0}, _rank(shape.rank()), _shape(shape) {
+    : _dtype(dtype), _shape(shape), _strides(shape.rank()) {
   switch (dtype) {
     case DTYPE::FLOAT32:
       _elementSize = sizeof(float);
@@ -44,7 +44,7 @@ TensorVariant::TensorVariant(DTYPE dtype, const Shape& shape)
   _data.reset(new char[data_size], std::default_delete<char[]>());
 
   int stride = 1;
-  for (int d = _rank - 1; d >= 0; --d)
+  for (int d = _shape.rank() - 1; d >= 0; --d)
   {
     _strides[d] = stride;
     stride *= _shape.dim(d);
@@ -65,10 +65,10 @@ TensorVariant::TensorVariant(DTYPE dtype, const Shape& shape, const void* data)
  */
 TensorVariant::TensorVariant(const TensorVariant& t_old,
                              const Shape& shape)
-  : _dtype(t_old._dtype), _data(t_old._data), _strides{0}, _rank(shape.rank()),
+  : _dtype(t_old._dtype), _data(t_old._data), _strides(static_cast<size_t>(shape.rank())),
     _shape(shape), _elementSize(t_old._elementSize) {
-  int axis_old = t_old._rank - 1;
-  for (int d = _rank - 1; d >= 0; d--) {
+  int axis_old = t_old._shape.rank() - 1;
+  for (int d = shape.rank() - 1; d >= 0; d--) {
     if (t_old._shape.dim(axis_old) == 1)
       _strides[d] = 0;
     else
@@ -79,24 +79,5 @@ TensorVariant::TensorVariant(const TensorVariant& t_old,
   }
 }
 
-char* TensorVariant::at(const Index& idx) const {
-  return _data.get() + getOffset(idx) * _elementSize;
-}
-
-char* TensorVariant::atOffset(int32_t offset) const {
-  assert(offset >= 0 && offset < getShape().numElements());
-  return _data.get() + offset * _elementSize;
-}
-
-size_t TensorVariant::getOffset(const Index &idx) const {
-  assert(idx.rank() == getShape().rank());
-  std::size_t offset = 0;
-  for (size_t i = 0; i < _rank; ++i)
-  {
-    offset += idx.at(i) * _strides[i];
-  }
-  return offset;
-}
-
 } // namespace mir
 } // namespace nnc
diff --git a/contrib/nnc/include/ADT/SmallVector.h b/contrib/nnc/include/ADT/SmallVector.h
new file mode 100644
index 0000000..3f09ad7
--- /dev/null
+++ b/contrib/nnc/include/ADT/SmallVector.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _NNC_CORE_SMALL_VECTOR_H
+#define _NNC_CORE_SMALL_VECTOR_H
+
+#include <cassert>
+#include <iterator>
+#include <initializer_list>
+
+namespace nnc {
+namespace adt {
+
+/**
+ * @brief vector with cheap memory allocation
+ * @tparam T type of elements
+ * @tparam Capacity maximum number of elements
+ * @note like std::array, but tracks the number of used elements; elements are stored inline (no heap)
+ */
+template <typename T, size_t Capacity>
+class small_vector {
+public:
+  using value_type = T;
+  using reference = T&;
+  using iterator = T*;
+  using size_type = size_t;
+
+  template <typename It>
+  small_vector(It begin, It end) : _size(std::distance(begin, end)) {
+    assert(_size <= Capacity);
+    std::copy(begin, end, this->begin());
+  }
+
+  explicit small_vector(size_t size, value_type initializer = value_type()) : _size(size) {
+    assert(_size <= Capacity);
+    std::fill(begin(), end(), initializer);
+  }
+
+  explicit small_vector() : _size(0) {}
+
+  small_vector(std::initializer_list<value_type>&& l) : _size(l.size()) {
+    assert(_size <= Capacity);
+    std::copy(std::begin(l), std::end(l), begin());
+  }
+
+  /**
+   * @return current size
+   */
+  inline size_t size() const noexcept {
+    return _size;
+  }
+
+  /**
+   * @return maximum number of elements this vector can hold
+   */
+  constexpr size_t capacity() const {
+    return Capacity;
+  }
+
+  /**
+  * @brief resize to given new size
+  * @note if new size is greater than current size, new elements are value-initialized (set to T())
+  */
+  void resize(size_t new_size) noexcept {
+    assert(new_size <= Capacity);
+    if (new_size > _size) {
+      std::fill(_storage + _size, _storage + new_size, T());
+    }
+    _size = new_size;
+  }
+
+  /**
+   * @return reference to the element at position idx
+   */
+  inline reference operator[](size_t idx) noexcept {
+    assert(idx >= 0 && idx < _size);
+    return _storage[idx];
+  }
+
+  /**
+   * @return value of element at position idx
+   */
+  inline constexpr value_type operator[](size_t idx) const noexcept {
+    // assert shares the return statement (comma operator): C++11 constexpr bodies allow only a single return
+    return assert(idx >= 0 && idx < _size), _storage[idx];
+  }
+
+  inline iterator begin() noexcept {
+    return std::begin(_storage);
+  }
+
+  inline iterator end() noexcept {
+    return _storage + _size;
+  }
+
+  inline void push_back(const value_type& e) noexcept {
+    assert(_size < Capacity);
+    _storage[_size++] = e;
+  }
+
+  inline void push_back(value_type&& e) noexcept {
+    assert(_size < Capacity);
+    _storage[_size++] = std::move(e);
+  }
+
+private:
+  size_t _size;
+  value_type _storage[Capacity];
+};
+
+template <typename T, size_t LCapacity, size_t RCapacity>
+bool operator==(const small_vector<T, LCapacity>& lhs, const small_vector<T, RCapacity>& rhs) {
+  if (lhs.size() != rhs.size()) {
+    return false;
+  }
+
+  bool equal = true;
+  size_t end = lhs.size();
+  for (size_t i = 0; i < end; ++i) {
+    equal &= (lhs[i] == rhs[i]);
+  }
+
+  return equal;
+}
+
+} // namespace adt
+} // namespace nnc
+
+#endif //_NNC_CORE_SMALL_VECTOR_H
diff --git a/contrib/nnc/include/core/modelIR/Common.h b/contrib/nnc/include/core/modelIR/Common.h
new file mode 100644
index 0000000..58e1dd0
--- /dev/null
+++ b/contrib/nnc/include/core/modelIR/Common.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _NNC_CORE_COMMON_H_
+#define _NNC_CORE_COMMON_H_
+
+namespace nnc {
+/**
+ * @brief maximum number of dimensions that an Index, Shape or Tensor can have
+ */
+  constexpr size_t MAX_DIMENSION_COUNT = 8;
+
+  inline constexpr size_t wrap_index(int32_t index, size_t limit) noexcept {
+    return static_cast<size_t>(index >= 0 ? index : limit + index);
+  }
+}
+
+#endif // _NNC_CORE_COMMON_H_
diff --git a/contrib/nnc/include/core/modelIR/Index.h b/contrib/nnc/include/core/modelIR/Index.h
index 931e11a..d6ed3d7 100644
--- a/contrib/nnc/include/core/modelIR/Index.h
+++ b/contrib/nnc/include/core/modelIR/Index.h
@@ -18,36 +18,66 @@
 #define _NNC_CORE_LINALG_INDEX_H_
 
 #include <initializer_list>
-#include <vector>
 #include <cstdint>
 #include <ostream>
 
-namespace nnc
-{
-namespace mir
-{
+#include "core/modelIR/Common.h"
 
-class Index
-{
+#include "ADT/SmallVector.h"
+
+namespace nnc {
+namespace mir {
+
+class Index {
 public:
   Index() = default;
-  Index(std::initializer_list<int32_t>&& l);
-  Index(std::vector<int32_t>&& vec);
 
-  int32_t rank(void) const;
+  Index(std::initializer_list<int32_t>&& l) noexcept : _indices(std::move(l)) {
+    // DO NOTHING
+  }
+
+  Index(size_t rank) : _indices(rank) {}
+
+  int32_t rank() const { return static_cast<int32_t>(_indices.size()); }
+
+
+  /**
+   * @brief resize index to given dimension number
+   * @param size new number of dimensions
+   * @return *this
+   * @warning if new size is greater than old, new dimensions are undefined
+   */
+  Index& resize(int32_t size);
 
-  Index &resize(int32_t size);
+  /**
+   * @brief set the position on every axis to `index`
+   * @return `*this`
+   */
+  Index& fill(int32_t index);
 
-  Index &fill(int32_t index);
+  /**
+   * @brief return position on given axis
+   * @param axis axis to access; a negative value counts back from the last axis (-1 is the last)
+   * @return reference to the index stored for that axis
+   */
+  int32_t& at(int32_t axis) {
+    return _indices[wrap_index(axis, _indices.size())];
+  }
 
-  int32_t &at(int32_t axis);
-  int32_t at(int32_t axis) const;
+  /**
+   * @brief return position on given axis
+   * @param axis axis to access; a negative value counts back from the last axis (-1 is the last)
+   * @return copy of the index stored for that axis
+   */
+  int32_t at(int32_t axis) const {
+    return _indices[wrap_index(axis, _indices.size())];
+  }
 
 private:
-  std::vector<int32_t> _indices;
+  adt::small_vector<int32_t, MAX_DIMENSION_COUNT> _indices;
 };
 
-std::ostream &operator<<(std::ostream &s, const Index &sh);
+std::ostream &operator<<(std::ostream &s, const Index &idx);
 
 } // namespace mir
 } // namespace nnc
diff --git a/contrib/nnc/include/core/modelIR/Shape.h b/contrib/nnc/include/core/modelIR/Shape.h
index f495270..13028d3 100644
--- a/contrib/nnc/include/core/modelIR/Shape.h
+++ b/contrib/nnc/include/core/modelIR/Shape.h
@@ -21,6 +21,10 @@
 #include <vector>
 #include <cstdint>
 #include <ostream>
+#include <cassert>
+
+#include "ADT/SmallVector.h"
+#include "core/modelIR/Common.h"
 
 namespace nnc {
 namespace mir {
@@ -31,28 +35,40 @@ public:
 
   Shape() = default;
 
-  explicit Shape(int32_t rank) : _dims(static_cast<decltype(_dims)::size_type>(rank)) {}
+  explicit Shape(int32_t rank) : _dims(rank) {}
 
-  Shape(std::initializer_list<int32_t> dims) : _dims(dims) {}
+  Shape(std::initializer_list<int32_t>&& dims) : _dims(std::move(dims)) {}
 
-  explicit Shape(const std::vector<int32_t>& dims) : _dims(dims) {}
+  explicit Shape(const std::vector<int32_t>& dims) : _dims(std::begin(dims), std::end(dims)) {}
 
-  int32_t rank() const { return static_cast<int32_t>(_dims.size()); }
+  int32_t rank() const {
+    return static_cast<int32_t>(_dims.size());
+  }
 
   void resize(int32_t size);
 
-  int32_t& dim(int32_t axis);
+  int32_t& dim(int32_t axis) noexcept {
+    auto dim = wrap_index(axis, _dims.size());
+    return _dims[dim];
+  };
 
-  int32_t dim(int32_t axis) const;
+  int32_t dim(int32_t axis) const noexcept {
+    auto dim = wrap_index(axis, _dims.size());
+    return _dims[dim];
+  }
 
   int32_t numElements() const;
 
-  bool operator==(const Shape& rhs) const { return _dims == rhs._dims; }
+  bool operator==(const Shape& rhs) const {
+    return _dims == rhs._dims;
+  }
 
-  bool operator!=(const Shape& rhs) const { return _dims != rhs._dims; }
+  bool operator!=(const Shape& rhs) const {
+    return !(*this  == rhs);
+  }
 
 private:
-  std::vector<int32_t> _dims;
+  adt::small_vector<int32_t, MAX_DIMENSION_COUNT> _dims;
 };
 
 std::ostream& operator<<(std::ostream& s, const Shape& sh);
@@ -61,3 +77,4 @@ std::ostream& operator<<(std::ostream& s, const Shape& sh);
 } // namespace nnc
 
 #endif // _NNC_CORE_LINALG_SHAPE_H_
+
diff --git a/contrib/nnc/include/core/modelIR/ShapeRange.h b/contrib/nnc/include/core/modelIR/ShapeRange.h
index 12bf83b..e49a235 100644
--- a/contrib/nnc/include/core/modelIR/ShapeRange.h
+++ b/contrib/nnc/include/core/modelIR/ShapeRange.h
@@ -36,7 +36,7 @@ class ShapeIter :
     int32_t rank = _shape.rank();
     int32_t c = rank - 1;
     pidx[c]++;
-    while( (pidx[c] > pshape[c] - 1) && (c > 0) ) {
+    while (pidx[c] >= pshape[c] && c > 0 ) {
       pidx[c] = 0;
       pidx[--c]++;
     }
@@ -44,7 +44,7 @@ class ShapeIter :
     return *this;
   }
 
-  ShapeIter operator++(int) {
+  const ShapeIter operator++(int) {
     ShapeIter it = *this;
     ++*this;
     return it;
@@ -54,7 +54,7 @@ class ShapeIter :
     return _index;
   }
 
-  bool operator!=(ShapeIter& iter) {
+  bool operator!=(const ShapeIter& iter) const {
     assert(iter._index.rank() == _index.rank());
     assert(iter._shape == _shape);
     return _pos != iter._pos;
diff --git a/contrib/nnc/include/core/modelIR/TensorVariant.h b/contrib/nnc/include/core/modelIR/TensorVariant.h
index 8544c1f..8bf3a1d 100644
--- a/contrib/nnc/include/core/modelIR/TensorVariant.h
+++ b/contrib/nnc/include/core/modelIR/TensorVariant.h
@@ -24,14 +24,13 @@
 #include "core/modelIR/Index.h"
 #include "core/modelIR/Shape.h"
 #include "core/modelIR/DataType.h"
+#include "core/modelIR/Common.h"
 
 namespace nnc
 {
 namespace mir
 {
 
-constexpr int MAX_DIMENSIONS = 32;
-
 class TensorVariant {
 public:
   TensorVariant(DTYPE dtype, const Shape& shape);
@@ -42,20 +41,31 @@ public:
 
   virtual ~TensorVariant() = default;
 
-  char* at(const Index& idx) const;
-  char* atOffset(int32_t offset) const;
-  size_t getOffset(const Index &idx) const;
+  char* at(const Index& idx) const {
+    return _data.get() + getOffset(idx) * _elementSize;
+  }
 
-  virtual const Shape &getShape() const { return _shape; }
-  DTYPE getDataType() const { return _dtype; }
+  char* atOffset(int32_t offset) const {
+    assert(offset >= 0 && offset < getShape().numElements());
+    return _data.get() + offset * _elementSize;
+  }
 
+  size_t getOffset(const Index &idx) const {
+    assert(idx.rank() == getShape().rank());
+    std::size_t offset = 0;
+    for (int i = 0; i < _shape.rank(); ++i)
+      offset += idx.at(i) * _strides[i];
+    return offset;
+  }
+
+  const Shape &getShape() const { return _shape; }
+  DTYPE getDataType() const { return _dtype; }
   size_t getElementSize() const { return _elementSize; }
 
  private:
   DTYPE _dtype;
   std::shared_ptr<char> _data;
-  int_fast32_t _strides[MAX_DIMENSIONS];
-  size_t _rank;
+  adt::small_vector<int_fast32_t, MAX_DIMENSION_COUNT> _strides;
   Shape _shape;
 
   size_t _elementSize;
diff --git a/contrib/nnc/passes/interpreter/ops/DeConv2D.cpp b/contrib/nnc/passes/interpreter/ops/DeConv2D.cpp
index 5b0afd8..8208613 100644
--- a/contrib/nnc/passes/interpreter/ops/DeConv2D.cpp
+++ b/contrib/nnc/passes/interpreter/ops/DeConv2D.cpp
@@ -52,42 +52,49 @@ std::vector<nnc::mir::TensorVariant> nnc::DeConv2D::operator()() {
   Index input_idx;
   input_idx.resize(in_shape.rank());
 
-  for (auto &out_idx : out_range) {
-    for (auto &kernel_idx_r : kernel_range) {
-      auto kernel_idx = kernel_idx_r;
+  Index kernel_idx;
+  kernel_idx.resize(k_shape.rank());
 
+  for (auto& out_idx : out_range) {
+    auto out_region = res_accesor.getRegion(out_idx);
+    assert(out_region.size() == num_kernels);
+
+    for (auto& kernel_idx_r : kernel_range) {
       // rotate kernel 180 deg around last axis
       // by index transform
       for (int32_t d = 0; d < 2; ++d) {
-        kernel_idx.at(d) = kernel.getShape().dim(d) - kernel_idx.at(d) - 1;
+        kernel_idx.at(d) = kernel.getShape().dim(d) - kernel_idx_r.at(d) - 1;
       }
+      kernel_idx.at(2) = kernel_idx_r.at(2);
+      kernel_idx.at(3) = kernel_idx_r.at(3);
 
       // flag that keeps info on whether the current input element is from input
       // or is from dilation by stride
       bool is_from_input = true;
       for (int32_t d = 1; d < input_idx.rank() - 1; ++d) {
         const auto num = (out_idx.at(d) + pads.at(d - 1) - kernel_idx.at(d - 1));
-        const auto div_res = num / strides.dim(d - 1);
-        const auto rem = num % strides.dim(d - 1);
+        auto stride = strides.dim(d - 1 );
+        const auto div_res = num / stride;
+        const auto rem = num - div_res*stride;
         is_from_input = is_from_input && rem == 0;
         if (rem != 0) break;
         input_idx.at(d) = div_res;
       }
-      // batch is same as output's
-      input_idx.at(0) = out_idx.at(0);
-      // channel index - same as kernel's
-      input_idx.at(3) = kernel_idx.at(2);
+      if (is_from_input) {
+        // batch is same as output's
+        input_idx.at(0) = out_idx.at(0);
+        // channel index - same as kernel's
+        input_idx.at(3) = kernel_idx.at(2);
 
-      if (in_range.contains(input_idx) and is_from_input) {
-        auto kernel_region = kernel.getRegion(kernel_idx);
-        assert( kernel_region.size() == num_kernels );
+        if (in_range.contains(input_idx)) {
+          auto kernel_region = kernel.getRegion(kernel_idx);
+          assert(kernel_region.size() == num_kernels);
 
-        auto out_region = res_accesor.getRegion(out_idx);
-        assert( out_region.size() == num_kernels );
-        auto in = _input.at(input_idx);
+          auto in = _input.at(input_idx);
 
-        for (int32_t kernel_index = 0; kernel_index < num_kernels; kernel_index++) {
-          out_region.base()[kernel_index] += in * kernel_region.base()[kernel_index];
+          for (int32_t kernel_index = 0; kernel_index < num_kernels; kernel_index++) {
+            out_region.base()[kernel_index] += in * kernel_region.base()[kernel_index];
+          }
         }
       }
     }
-- 
2.7.4