From: Andrei Shedko/AI Tools Lab /SRR/Engineer/Samsung Electronics Date: Mon, 21 Jan 2019 15:37:23 +0000 (+0300) Subject: [nnc] Conv refactor (#2886) X-Git-Tag: nncc_backup~938 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1a8eac738cd6a169e44244fdff2d349f2afe2890;p=platform%2Fcore%2Fml%2Fnnfw.git [nnc] Conv refactor (#2886) Cleaned up `Conv2d` and `ConvTranspose`; updated `Conv2d` with the upstream implementation. Signed-off-by: Andrei Shedko --- diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_common_funcs.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_common_funcs.def index e1b6caa..30e490c 100644 --- a/contrib/nnc/passes/soft_backend/code_snippets/cpp_common_funcs.def +++ b/contrib/nnc/passes/soft_backend/code_snippets/cpp_common_funcs.def @@ -267,8 +267,8 @@ inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) { TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]); TFLITE_DCHECK(i2 >= 0 && i2 < dims.sizes[2]); TFLITE_DCHECK(i3 >= 0 && i3 < dims.sizes[3]); - return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] + - i3 * dims.strides[3]; + return i0 * dims.strides[0] + i1 * dims.strides[1] + + i2 * dims.strides[2] + i3 * dims.strides[3]; } // Gets next index to iterate through a multidimensional array. @@ -590,7 +590,7 @@ struct PaddingValues { }; struct ConvParams { - PaddingType padding_type; + // PaddingType padding_type; PaddingValues padding_values; // TODO(starka): This was just "stride", so check that width+height is OK. int16 stride_width; @@ -614,6 +614,32 @@ struct ConvParams { */ }; + +struct DepthwiseParams { + //PaddingType padding_type; + PaddingValues padding_values; + int16 stride_width; + int16 stride_height; + int16 dilation_width_factor; + int16 dilation_height_factor; + int16 depth_multiplier; + /* + // uint8 inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32 input_offset; + int32 weights_offset; + int32 output_offset; + int32 output_multiplier; + int output_shift; + // uint8, etc, activation params. + int32 quantized_activation_min; + int32 quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; + */ +}; + inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) { TFLITE_DCHECK_EQ(shape.DimensionsCount(), 4); const int* dims_data = shape.DimsDataUpTo4D(); diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_conv.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_conv.def index be51b99..4fef4dc 100644 --- a/contrib/nnc/passes/soft_backend/code_snippets/cpp_conv.def +++ b/contrib/nnc/passes/soft_backend/code_snippets/cpp_conv.def @@ -14,12 +14,15 @@ limitations under the License.
==============================================================================*/ template <typename T> -inline void ExtractPatchIntoBufferColumn( - const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth, - int stride_width, int stride_height, int pad_width, int pad_height, - int in_width, int in_height, int in_depth, int single_buffer_length, - int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) { - +inline void ExtractPatchIntoBufferColumn(const RuntimeShape& input_shape, int w, + int h, int b, int kheight, int kwidth, + int stride_width, int stride_height, + int pad_width, int pad_height, + int in_width, int in_height, + int in_depth, int single_buffer_length, + int buffer_id, const T* in_data, + T* conv_buffer_data, uint8 zero_byte) { + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); // This chunk of code reshapes all the inputs corresponding to // output (b, h, w) to a column vector in conv_buffer(:, buffer_id). const int kwidth_times_indepth = kwidth * in_depth; @@ -37,11 +40,11 @@ inline void ExtractPatchIntoBufferColumn( const int ih_start = std::max(0, ih_ungated_start); const int iw_start = std::max(0, iw_ungated_start); const int single_row_num = - std::min(kwidth - w_offset, in_width - iw_start) * in_depth; + std::min(kwidth - w_offset, in_width - iw_start) * in_depth; const int output_row_offset = (buffer_id * single_buffer_length); int out_offset = - output_row_offset + (h_offset * kwidth + w_offset) * in_depth; - int in_offset = Offset(input_dims, 0, iw_start, ih_start, b); + output_row_offset + (h_offset * kwidth + w_offset) * in_depth; + int in_offset = Offset(input_shape, b, ih_start, iw_start, 0); // Express all of the calculations as padding around the input patch. const int top_padding = h_offset; @@ -55,7 +58,7 @@ inline void ExtractPatchIntoBufferColumn( // patch that are off the edge of the input image.
if (top_padding > 0) { const int top_row_elements = (top_padding * kwidth * in_depth); - memset(conv_buffer_data + output_row_offset, byte_zero, + memset(conv_buffer_data + output_row_offset, zero_byte, (top_row_elements * sizeof(T))); } @@ -72,14 +75,14 @@ inline void ExtractPatchIntoBufferColumn( for (int ih = ih_start; ih < ih_end; ++ih) { if (left_padding > 0) { const int left_start = (out_offset - (left_padding * in_depth)); - memset(conv_buffer_data + left_start, byte_zero, + memset(conv_buffer_data + left_start, zero_byte, (left_padding * in_depth * sizeof(T))); } memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T)); if (right_padding > 0) { const int right_start = (out_offset + single_row_num); - memset(conv_buffer_data + right_start, byte_zero, + memset(conv_buffer_data + right_start, zero_byte, (right_padding * in_depth * sizeof(T))); } out_offset += kwidth_times_indepth; @@ -92,28 +95,33 @@ inline void ExtractPatchIntoBufferColumn( if (bottom_padding > 0) { const int bottom_row_elements = (bottom_padding * kwidth * in_depth); const int bottom_start = - output_row_offset + - ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); - memset(conv_buffer_data + bottom_start, byte_zero, + output_row_offset + + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); + memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T))); } } +/* DilatedIm2col should be placed here when it is required. */ + template <typename T> -void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width, - int stride_height, int pad_width, int pad_height, int kheight, - int kwidth, uint8 byte_zero, T* output_data, - const Dims<4>& output_dims) { - - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_depth = ArraySize(input_dims, 0); - const int input_width = ArraySize(input_dims, 1); - const int input_height = ArraySize(input_dims, 2); - const int output_depth = ArraySize(output_dims, 0); - const int output_width = ArraySize(output_dims, 1); - const int output_height = ArraySize(output_dims, 2); +void Im2col(const ConvParams& params, int kheight, int kwidth, uint8 zero_byte, + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& output_shape, T* output_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); int buffer_id = 0; // Loop over the output nodes.
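A note on the Offset() call flip above (Offset(input_dims, 0, iw_start, ih_start, b) becoming Offset(input_shape, b, ih_start, iw_start, 0)): legacy Dims<4> stores sizes and strides innermost-first, while RuntimeShape uses plain NHWC order, so the same element is addressed with reversed coordinates. A minimal standalone sketch of that equivalence, assuming a packed NHWC buffer (the helper names are illustrative, not part of this patch):

#include <cassert>

// Packed NHWC flat offset, as the RuntimeShape-based Offset() computes it:
// index = ((b * H + h) * W + w) * C + c.
static int NhwcOffset(int H, int W, int C, int b, int h, int w, int c) {
  return ((b * H + h) * W + w) * C + c;
}

// The legacy Dims<4> view of the same packed buffer keeps sizes innermost-first,
// {C, W, H, N}, with strides {1, C, C*W, C*W*H}, so Offset(dims, i0, i1, i2, i3)
// expands to i0 + i1*C + i2*C*W + i3*C*W*H, i.e. its arguments run (c, w, h, b).
static int LegacyOffset(int H, int W, int C, int i0, int i1, int i2, int i3) {
  return i0 + i1 * C + i2 * (C * W) + i3 * (C * W * H);
}

int main() {
  const int N = 2, H = 5, W = 7, C = 3;
  for (int b = 0; b < N; ++b)
    for (int h = 0; h < H; ++h)
      for (int w = 0; w < W; ++w)
        for (int c = 0; c < C; ++c)
          // Old call: Offset(dims, c, w, h, b); new call: Offset(shape, b, h, w, c).
          assert(LegacyOffset(H, W, C, c, w, h, b) ==
                 NhwcOffset(H, W, C, b, h, w, c));
  return 0;
}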
@@ -121,53 +129,109 @@ void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width, for (int h = 0; h < output_height; ++h) { for (int w = 0; w < output_width; ++w) { ExtractPatchIntoBufferColumn( - input_dims, w, h, b, kheight, kwidth, stride_width, stride_height, - pad_width, pad_height, input_width, input_height, input_depth, - output_depth, buffer_id, input_data, output_data, byte_zero); + input_shape, w, h, b, kheight, kwidth, stride_width, stride_height, + pad_width, pad_height, input_width, input_height, input_depth, + output_depth, buffer_id, input_data, output_data, zero_byte); ++buffer_id; } } } } -inline void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - int stride_width, int stride_height, - int pad_width, int pad_height, - float* output_data, const Dims<4>& output_dims, - float* im2col_data, const Dims<4>& im2col_dims) { +inline void Conv(const ConvParams& params, + const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& filter_shape, const float* filter_data, + const RuntimeShape& output_shape, float* output_data, + const RuntimeShape& im2col_shape, float* im2col_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + /* Dilation + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + */ + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + (void)im2col_data; - (void)im2col_dims; + (void)im2col_shape; // NB: static_cast<float>(0x00000000h) == 0.0f const uint8 float_zero_byte = 0x00; const float* gemm_input_data = nullptr; - const Dims<4>* gemm_input_dims = nullptr; - const int filter_width = ArraySize(filter_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); + const RuntimeShape* gemm_input_shape = nullptr; + const int filter_width = filter_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); const bool need_im2col = stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; - if (need_im2col) { + // Dilated im2col + /* const bool need_dilated_im2col = + dilation_width_factor != 1 || dilation_height_factor != 1; + if (need_dilated_im2col) { + DilatedIm2col(params, float_zero_byte, input_shape, input_data, + filter_shape, output_shape, im2col_data); + gemm_input_data = im2col_data; + gemm_input_shape = &im2col_shape; + } else */if (need_im2col) { TFLITE_DCHECK(im2col_data); - Im2col(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_height, filter_width, float_zero_byte, - im2col_data, im2col_dims); + Im2col(params, filter_height, filter_width, float_zero_byte, input_shape, + input_data, im2col_shape, im2col_data); gemm_input_data = im2col_data; - gemm_input_dims = &im2col_dims; + gemm_input_shape = &im2col_shape; } else { // TODO(aselle): We need to make sure to not send im2col if it is not // needed.
TFLITE_DCHECK(!im2col_data); gemm_input_data = input_data; - gemm_input_dims = &input_dims; + gemm_input_shape = &input_shape; } - const auto im2col_matrix_map = - MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims); - const auto filter_matrix_map = - MapAsMatrixWithLastDimAsCols(filter_data, filter_dims); - auto output_matrix_map = - MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + // The following code computes matrix multiplication c = a * transpose(b) + // with CBLAS, where: + // * `a` is a matrix with dimensions (m, k). + // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n). + // * `c` is a matrix with dimensions (m, n). + // The naming of variables is aligned with the CBLAS specification here. + const float* a = gemm_input_data; + const float* b = filter_data; + float* c = output_data; + const int gemm_input_dims = gemm_input_shape->DimensionsCount(); + int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1); + int n = output_shape.Dims(3); + int k = gemm_input_shape->Dims(gemm_input_dims - 1); + +#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__) + // The stride of matrix a, b and c respectively. + int stride_a = k; + int stride_b = k; + int stride_c = n; + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a, + stride_a, b, stride_b, 0.0f, c, stride_c); +#else + // When an optimized CBLAS implementation is not available, fall back + // to using Eigen. + typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> + Matrix; + typedef Eigen::Map<Matrix> MatrixRef; + typedef Eigen::Map<const Matrix> ConstMatrixRef; + + MatrixRef matrix_c(c, m, n); + ConstMatrixRef matrix_a(a, m, k); + ConstMatrixRef matrix_b(b, n, k); + + // The following special casing for when a or b is a vector is required + // as Eigen seems to fail to make this optimization on its own. + if (n == 1) { + matrix_c.col(0).noalias() = matrix_a * matrix_b.row(0).transpose(); + } else if (m == 1) { + matrix_c.row(0).noalias() = matrix_a.row(0) * matrix_b.transpose(); + } else { + matrix_c.noalias() = matrix_a * matrix_b.transpose(); + } - Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map); +#endif // defined(TF_LITE_USE_CBLAS) && defined(__APPLE__) } diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_depthwise_conv.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_depthwise_conv.def index b256982..83239ff 100644 --- a/contrib/nnc/passes/soft_backend/code_snippets/cpp_depthwise_conv.def +++ b/contrib/nnc/passes/soft_backend/code_snippets/cpp_depthwise_conv.def @@ -1,17 +1,15 @@ /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// Implementation of float DepthwiseConv template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> struct FloatDepthwiseConvKernel {}; @@ -750,7 +748,8 @@ struct FloatDepthwiseConvKernel { // Accumulates the effect of one row of the filter, on a segment of one row // of the output, accessing the corresponding one row of the input.
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> -void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width, +void FloatDepthwiseConvAccumRow(int stride, int dilation_factor, + int input_depth, int input_width, const float* input_data, int pad_width, int depth_multiplier, int filter_width, const float* filter_data, @@ -778,90 +777,72 @@ void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width, int out_x_loop_end_unclampled = 0; if (kAllowStrided) { if (stride == 2) { - out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2; + out_x_loop_start_unclampled = + (pad_width - dilation_factor * filter_x + 1) / 2; out_x_loop_end_unclampled = - (pad_width + input_width - filter_x + 1) / 2; + (pad_width + input_width - dilation_factor * filter_x + 1) / 2; } else if (stride == 4) { - out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4; + out_x_loop_start_unclampled = + (pad_width - dilation_factor * filter_x + 3) / 4; out_x_loop_end_unclampled = - (pad_width + input_width - filter_x + 3) / 4; + (pad_width + input_width - dilation_factor * filter_x + 3) / 4; } else { out_x_loop_start_unclampled = - (pad_width - filter_x + stride - 1) / stride; - out_x_loop_end_unclampled = - (pad_width + input_width - filter_x + stride - 1) / stride; + (pad_width - dilation_factor * filter_x + stride - 1) / stride; + out_x_loop_end_unclampled = (pad_width + input_width - + dilation_factor * filter_x + stride - 1) / + stride; } } else { - out_x_loop_start_unclampled = pad_width - filter_x; - out_x_loop_end_unclampled = pad_width + input_width - filter_x; + out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x; + out_x_loop_end_unclampled = + pad_width + input_width - dilation_factor * filter_x; } // The kernel will have to iterate on the segment of the // output row that starts at out_x_loop_start and ends at out_x_loop_end. const int out_x_loop_start = - std::max(out_x_buffer_start, out_x_loop_start_unclampled); + std::max(out_x_buffer_start, out_x_loop_start_unclampled); const int out_x_loop_end = - std::min(out_x_buffer_end, out_x_loop_end_unclampled); + std::min(out_x_buffer_end, out_x_loop_end_unclampled); float* acc_buffer_ptr = - acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; - const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = + (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; const float* input_ptr = input_data + in_x_origin * input_depth; const int num_output_pixels = out_x_loop_end - out_x_loop_start; FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, - kFixedDepthMultiplier>::Run(num_output_pixels, - input_depth, - depth_multiplier, - input_ptr, - input_ptr_increment, - filter_base_ptr, - acc_buffer_ptr); + kFixedDepthMultiplier>::Run(num_output_pixels, + input_depth, + depth_multiplier, + input_ptr, + input_ptr_increment, + filter_base_ptr, + acc_buffer_ptr); filter_base_ptr += output_depth; } } // generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
inline void FloatDepthwiseConvAccumRowGeneric( - int stride, int input_depth, int input_width, const float* input_data, - int pad_width, int depth_multiplier, int filter_width, - const float* filter_data, int out_x_buffer_start, int out_x_buffer_end, - int output_depth, float* acc_buffer) { -#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK -#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK - LOG(FATAL) - << "\n\n" - << "*****************************************************************\n" - << "* This tfmini inference code was about to use the slow generic\n" - << "* fallback implementation for a DepthwiseConv op, and we want you\n" - << "* to be aware of that so that you will know why you get terrible\n" - << "* performance.\n" - << "*\n" - << "* If you would like to carry on with the slow code, compile\n" - << "* with this preprocessor token defined:\n" - << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n" - << "*\n" - << "* The right thing to do, if you care about performance, is to add\n" - << "* a new DepthwiseConv kernel to tfmini to cover your case.\n" - << "* The relevant parameters defining your case are:\n" - << "* stride = " << stride << "\n" - << "* input_depth = " << input_depth << "\n" - << "* depth_multiplier = " << depth_multiplier << "\n" - << "*\n" - << "* Please do not hesitate to contact benoitjacob@ with this\n" - << "* information.\n" - << "*****************************************************************\n"; -#endif // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK -#endif // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK + int stride, int dilation_factor, int input_depth, int input_width, + const float* input_data, int pad_width, int depth_multiplier, + int filter_width, const float* filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, float* acc_buffer) { const float* filter_base_ptr = filter_data; for (int filter_x = 0; filter_x < filter_width; ++filter_x) { const int out_x_loop_start = std::max( - out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); - const int out_x_loop_end = - std::min(out_x_buffer_end, - (pad_width + input_width - filter_x + stride - 1) / stride); + out_x_buffer_start, + (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_end = std::min( + out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / + stride); float* acc_buffer_ptr = - acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; - const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = + (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; const float* input_ptr = input_data + in_x_origin * input_depth; const int input_ptr_increment = (stride - 1) * input_depth; for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { @@ -879,33 +860,40 @@ inline void FloatDepthwiseConvAccumRowGeneric( } } -// Initializes the accumulator buffer with zero values. -inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, - float* acc_buffer) { - for (int i = 0; i < num_output_pixels; i++) - for (int k = 0; k < output_depth; k++) { - acc_buffer[i * output_depth + k] = 0; - } +// Initializes the accumulator buffer with zero values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, float* acc_buffer) { + memset(acc_buffer, 0, sizeof(acc_buffer[0]) * output_depth * num_output_pixels); + } -inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - int stride_width, int stride_height, - int pad_width, int pad_height, - int depth_multiplier, - float* output_data, const Dims<4>& output_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); +inline void DepthwiseConv( + const DepthwiseParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& output_shape, + float* output_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - static const int kAccBufferMaxSize = 2048; + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + + static const int kAccBufferMaxSize = 4832; float acc_buffer[kAccBufferMaxSize]; TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth); const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; @@ -968,18 +956,26 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, row_accum_func = FloatDepthwiseConvAccumRowGeneric; } + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + // Now that we have determined row_accum_func, we can start work. 
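// Worked example of the acc_buffer tiling above (illustrative numbers, not
// taken from this patch): with output_depth = 24, kOutputPixelsInAccBuffer =
// 4832 / 24 = 201, so each pass of the loop below zero-fills 201 * 24 floats
// via DepthwiseConvInitAccBuffer() and accumulates at most 201 output pixels
// before they are stored to the destination buffer.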
float* output_ptr = output_data; for (int b = 0; b < batches; ++b) { for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; - const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(filter_height, + (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; out_x_buffer_start += kOutputPixelsInAccBuffer) { const int out_x_buffer_end = std::min( - output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); // We call a 'pixel' a group of activations that share all but the // 'depth'/'channel' coordinate. num_output_pixels is the number of // output pixels that we will accumulate in this loop iteration. @@ -990,14 +986,13 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, // Accumulation loop. Most of the time should be spent in here. for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { - const int in_y = in_y_origin + filter_y; - row_accum_func(stride_width, input_depth, input_width, - input_data + in_y * input_dims.strides[2] + - b * input_dims.strides[3], - pad_width, depth_multiplier, filter_width, - filter_data + filter_y * filter_dims.strides[2], - out_x_buffer_start, out_x_buffer_end, output_depth, - acc_buffer); + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func( + stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); } // Finished accumulating. Now store to destination.
const int num_output_values = output_depth * num_output_pixels; @@ -1011,6 +1006,11 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, acc[k] = vld1q_f32(acc_buffer + i + 4 * k); } for (int k = 0; k < 4; k++) { + acc[k] = vmaxq_f32( + vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc[k])); + } + for (int k = 0; k < 4; k++) { vst1q_f32(output_ptr + 4 * k, acc[k]); } output_ptr += 16; @@ -1018,6 +1018,10 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, // Handle 4 values at a time for (; i <= num_output_values - 4; i += 4) { float32x4_t acc = vld1q_f32(acc_buffer + i); + + acc = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc)); + vst1q_f32(output_ptr, acc); output_ptr += 4; } @@ -1031,4 +1035,3 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, } } } - diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def index f5bef77..34e58e7 100644 --- a/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def +++ b/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def @@ -93,7 +93,7 @@ size_t volume(Dims d) return v; } -inline RuntimeShape shapeToRuntimeShape(const Shape &s) { +RuntimeShape shapeToRuntimeShape(const Shape& s) { const int rank = s.getDims(); RuntimeShape sh(rank); for (int i = 0; i < rank; i++) { @@ -186,60 +186,58 @@ void concat(Tensor &out, const char *params, const Args &...inputs) void conv2d(Tensor& out, const char* params, const Tensor& input, const Tensor& kernel, Tensor& temporary) { - Shape strides = deserializeShape(params); - Shape pads = deserializeShape(params); - Shape out_shape = deserializeShape(params); + const Shape strides = deserializeShape(params); + const Shape pads = deserializeShape(params); + const Shape out_shape = deserializeShape(params); out.reShape(out_shape); assert(strides.getDims() == 2); - const auto stride_h = static_cast<int32>(strides[0]); - const auto stride_w = static_cast<int32>(strides[1]); + const auto stride_h = static_cast<int16>(strides[0]); + const auto stride_w = static_cast<int16>(strides[1]); assert(pads.getDims() == 2); - const auto pad_h = static_cast<int32>(pads[0]); - const auto pad_w = static_cast<int32>(pads[1]); + const auto pad_h = static_cast<int16>(pads[0]); + const auto pad_w = static_cast<int16>(pads[1]); // Transpose the kernel from HWIO to OHWI format.
- Shape kernel_shape = kernel.getShape(); - kernel_shape = {kernel_shape[3], kernel_shape[0], kernel_shape[1], kernel_shape[2]}; - Dims<4> kernel_dims = shapeToDims(kernel_shape); - unique_ptr<float[]> kernel_data(new float[volume(kernel_dims)]); - TransposeParams transpose_params{4, {3, 0, 1, 2}}; - Transpose(transpose_params, - shapeToRuntimeShape(kernel.getShape()), kernel.getData(), - shapeToRuntimeShape(kernel_shape), kernel_data.get()); - - Dims<4> out_dims = shapeToDims(out_shape); - Dims<4> im2col_dims{{kernel_dims.sizes[0] * kernel_dims.sizes[1] * kernel_dims.sizes[2], - out_dims.sizes[1], - out_dims.sizes[2], - out_dims.sizes[3]}, - {}}; - - int stride = 1; - for (int i = 0; i < 4; ++i) { - im2col_dims.strides[i] = stride; - stride *= im2col_dims.sizes[i]; - } + const Shape kernel_shape = kernel.getShape(); + const RuntimeShape kernel_rt_shape = {static_cast<int32>(kernel_shape[3]), + static_cast<int32>(kernel_shape[0]), + static_cast<int32>(kernel_shape[1]), + static_cast<int32>(kernel_shape[2])}; + + const RuntimeShape out_rt_shape = shapeToRuntimeShape(out_shape); + const RuntimeShape im2col_shape{out_rt_shape.Dims(0), //batch + out_rt_shape.Dims(1), //height + out_rt_shape.Dims(2), //width + static_cast<int32>(kernel_shape[2] * + kernel_shape[0] * + kernel_shape[1])}; float* im2col_data = nullptr; - if (stride_w != 1 || stride_h != 1 || kernel_dims.sizes[1] != 1 || kernel_dims.sizes[2] != 1) { + if (stride_w != 1 || stride_h != 1 || kernel_shape[0] != 1 || kernel_shape[1] != 1) { im2col_data = temporary.getData(); } + const ConvParams conv_params{{pad_w, pad_h}, stride_w, stride_h}; - Conv(input.getData(), shapeToDims(input.getShape()), - kernel_data.get(), kernel_dims, - stride_w, stride_h, - pad_w, pad_h, - out.getData(), out_dims, - im2col_data, im2col_dims); + unique_ptr<float[]> kernel_data(new float[kernel_rt_shape.FlatSize()]); + TransposeParams transpose_params{4, {3, 0, 1, 2}}; + Transpose(transpose_params, + shapeToRuntimeShape(kernel_shape), kernel.getData(), + kernel_rt_shape, kernel_data.get()); + + Conv(conv_params, + shapeToRuntimeShape(input.getShape()), input.getData(), + kernel_rt_shape, kernel_data.get(), + out_rt_shape, out.getData(), + im2col_shape, im2col_data); } void convTransposed2d(Tensor& out, const char* params, const Tensor& input, const Tensor& kernel, Tensor& temporary) { - Shape strides = deserializeShape(params); - Shape pads = deserializeShape(params); - Shape out_shape = deserializeShape(params); + const Shape strides = deserializeShape(params); + const Shape pads = deserializeShape(params); + const Shape out_shape = deserializeShape(params); out.reShape(out_shape); assert(strides.getDims() == 2); @@ -250,71 +248,76 @@ void convTransposed2d(Tensor& out, const char* params, const Tensor& input, cons const auto pad_h = static_cast<int16>(pads[0]); const auto pad_w = static_cast<int16>(pads[1]); + const RuntimeShape input_rt_shape = shapeToRuntimeShape(input.getShape()); + const RuntimeShape out_rt_shape = shapeToRuntimeShape(out_shape); + // Transpose the kernel from HWOI to OHWI format.
- Shape kernel_shape = kernel.getShape(); - kernel_shape = {kernel_shape[2], kernel_shape[0], kernel_shape[1], kernel_shape[3]}; - Dims<4> kernel_dims = shapeToDims(kernel_shape); - unique_ptr<float[]> kernel_data(new float[volume(kernel_dims)]); + const Shape kernel_shape = kernel.getShape(); + const RuntimeShape kernel_rt_shape = {static_cast<int32>(kernel_shape[2]), + static_cast<int32>(kernel_shape[0]), + static_cast<int32>(kernel_shape[1]), + static_cast<int32>(kernel_shape[3])}; + unique_ptr<float[]> kernel_data(new float[kernel_rt_shape.FlatSize()]); TransposeParams transpose_params{4, {2, 0, 1, 3}}; Transpose(transpose_params, - shapeToRuntimeShape(kernel.getShape()), kernel.getData(), - shapeToRuntimeShape(kernel_shape), kernel_data.get()); - - RuntimeShape input_rt_shape = shapeToRuntimeShape(input.getShape()); - RuntimeShape out_rt_shape = shapeToRuntimeShape(out_shape); - RuntimeShape kernel_rt_shape = shapeToRuntimeShape(kernel_shape); + shapeToRuntimeShape(kernel_shape), kernel.getData(), + kernel_rt_shape, kernel_data.get()); const int32 kernel_height = kernel_rt_shape.Dims(1); const int32 kernel_width = kernel_rt_shape.Dims(2); - RuntimeShape im2col_shape{out_rt_shape.Dims(0), - out_rt_shape.Dims(1), - out_rt_shape.Dims(2), - input_rt_shape.Dims(3) * kernel_width * kernel_height}; + const RuntimeShape im2col_shape{out_rt_shape.Dims(0), + out_rt_shape.Dims(1), + out_rt_shape.Dims(2), + input_rt_shape.Dims(3) * kernel_width * kernel_height}; - ConvParams conv_params{PaddingType::kSame, {pad_w, pad_h}, stride_w, stride_h}; + ConvParams conv_params{{pad_w, pad_h}, stride_w, stride_h}; TransposeConv(conv_params, - input_rt_shape, input.getData(), - kernel_rt_shape, kernel_data.get(), - out_rt_shape, out.getData(), - im2col_shape, temporary.getData()); + input_rt_shape, input.getData(), + kernel_rt_shape, kernel_data.get(), + out_rt_shape, out.getData(), + im2col_shape, temporary.getData()); } void depthwiseConv2d(Tensor& out, const char* params, const Tensor& input, const Tensor& kernel) { - Shape strides = deserializeShape(params); - Shape pads = deserializeShape(params); - Shape out_shape = deserializeShape(params); + const Shape strides = deserializeShape(params); + const Shape pads = deserializeShape(params); + const Shape out_shape = deserializeShape(params); out.reShape(out_shape); assert(strides.getDims() == 2); - const auto stride_h = static_cast<int32>(strides[0]); - const auto stride_w = static_cast<int32>(strides[1]); + const auto stride_h = static_cast<int16>(strides[0]); + const auto stride_w = static_cast<int16>(strides[1]); assert(pads.getDims() == 2); - const auto pad_h = static_cast<int32>(pads[0]); - const auto pad_w = static_cast<int32>(pads[1]); + const auto pad_h = static_cast<int16>(pads[0]); + const auto pad_w = static_cast<int16>(pads[1]); - Dims<4> input_dims = shapeToDims(input.getShape()); - Dims<4> kernel_dims = shapeToDims(kernel.getShape()); - Dims<4> out_dims = shapeToDims(out_shape); + const RuntimeShape input_dims = shapeToRuntimeShape(input.getShape()); + const RuntimeShape kernel_dims = shapeToRuntimeShape(kernel.getShape()); + const RuntimeShape out_dims = shapeToRuntimeShape(out_shape); - int depth_multiplier = out_dims.sizes[0] / input_dims.sizes[0]; - assert(out_dims.sizes[0] % input_dims.sizes[0] == 0); + const auto depth_multiplier = static_cast<int16>(out_dims.Dims(3) / input_dims.Dims(3)); + assert(out_dims.Dims(3) % input_dims.Dims(3) == 0); // Reshape kernel -- squash zeroth and first dimensions.
- const int output_channels = kernel_dims.sizes[0] * kernel_dims.sizes[1]; - assert(output_channels == out_dims.sizes[0]); - const int kernel_w = kernel_dims.sizes[2]; - const int kernel_h = kernel_dims.sizes[3]; - kernel_dims = shapeToDims({kernel_h, kernel_w, output_channels}); - - DepthwiseConv(input.getData(), input_dims, - kernel.getData(), kernel_dims, - stride_w, stride_h, - pad_w, pad_h, - depth_multiplier, - out.getData(), out_dims); + const int output_channels = kernel_dims.Dims(3) * kernel_dims.Dims(2); + assert(output_channels == out_dims.Dims(3)); + const int kernel_w = kernel_dims.Dims(1); + const int kernel_h = kernel_dims.Dims(0); + const RuntimeShape kernel_rt_shape = {1, kernel_h, kernel_w, output_channels}; + + const DepthwiseParams depthwise_conv_params = {{pad_w, pad_h}, stride_w, + stride_h, 1, 1, + depth_multiplier}; + + // TODO: Fusing bias into the depthwise conv is close to a no-op due to the nature of the operation; + // consider doing that. + DepthwiseConv(depthwise_conv_params, + input_dims, input.getData(), + kernel_rt_shape, kernel.getData(), + out_dims, out.getData()); } void softmax(Tensor &out, const char *params, const Tensor &in) diff --git a/contrib/nnc/unittests/soft_backend/CPPOperations.cpp b/contrib/nnc/unittests/soft_backend/CPPOperations.cpp index 9245763..cc07592 100644 --- a/contrib/nnc/unittests/soft_backend/CPPOperations.cpp +++ b/contrib/nnc/unittests/soft_backend/CPPOperations.cpp @@ -604,7 +604,6 @@ TEST(cpp_operations_test, convTransposed2d) { auto pad_t = mir::ops::PaddingType::Same; auto op_generator = [&strides, pad_t]( mir::Graph& g, const std::vector<mir::IODescriptor>& inputs) { - return g.create<mir::ops::DeConv2DOp>("y", inputs[0], inputs[1], strides, pad_t); }; @@ -659,7 +658,7 @@ TEST(cpp_operations_test, depthwise_conv) { for (iT stride_w = 1; stride_w <= 3; ++stride_w) for (iT stride_h = 1; stride_h <= 3; ++stride_h) for (iT multiplier = 1; multiplier <= 2; ++multiplier) { - vector<int> input_shape_data{1, 5, 7, static_cast<int>(channels)}; // NHWC + vector<int> input_shape_data{1, 7, 6, static_cast<int>(channels)}; // NHWC vector<int> kernel_shape_data{kernel_h, kernel_w, channels, multiplier}; // HWCN mir::Shape strides{stride_h, stride_w}; vector> input_ntensors(2);
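To make the refactored calling convention concrete, here is a small self-contained sketch of how conv2d() above sizes its im2col scratch buffer and which GEMM the new Conv() ends up running. All shapes and numbers are illustrative only; none of them come from the patch:

#include <cstdio>

int main() {
  // Illustrative NHWC input and OHWI kernel (after the HWIO -> OHWI transpose).
  const int N = 1, in_h = 7, in_w = 6, in_c = 8;
  const int out_c = 16, k_h = 3, k_w = 3;
  const int stride_h = 2, stride_w = 2, pad_h = 1, pad_w = 1;

  // Output extent of a strided convolution with explicit padding.
  const int out_h = (in_h + 2 * pad_h - k_h) / stride_h + 1;  // -> 4
  const int out_w = (in_w + 2 * pad_w - k_w) / stride_w + 1;  // -> 3

  // Mirrors the need_im2col test in Conv(): 1x1 stride-1 kernels skip im2col.
  const bool need_im2col =
      stride_w != 1 || stride_h != 1 || k_w != 1 || k_h != 1;

  // im2col_shape = {N, out_h, out_w, k_h * k_w * in_c}; the `temporary` tensor
  // must hold at least this many floats before Conv() is entered.
  const long im2col_elems = (long)N * out_h * out_w * (k_h * k_w * in_c);

  // The GEMM named in the CBLAS comment: c(m, n) = a(m, k) * transpose(b(n, k)).
  const int m = N * out_h * out_w;  // output pixels
  const int k = k_h * k_w * in_c;   // filter volume per output channel
  const int n = out_c;              // output channels

  printf("need_im2col=%d im2col floats=%ld GEMM m=%d n=%d k=%d\n",
         need_im2col, im2col_elems, m, n, k);
  return 0;
}

Bundling stride and padding into ConvParams/DepthwiseParams, as this commit does, keeps the soft-backend entry points signature-compatible with the upstream TFLite kernels, so later upstream pulls (e.g. the commented-out dilation support) only need the struct fields uncommented.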