From 7a8baad25060c1ab7925588df598ce2664572cd1 Mon Sep 17 00:00:00 2001
From: "jijoong.moon" <jijoong.moon@samsung.com>
Date: Thu, 7 Jul 2022 18:24:54 +0900
Subject: [PATCH] [ Layers ] Add Parallelization along batch direction

This patch demonstrates parallelization along the batch direction,
using conv2d calcDerivative() as the example.

. add the meson option with the 'nntr-num-threads' key and an integer value
. add the extra compile option NNTR_NUM_THREADS (default value is 1)

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <jijoong.moon@samsung.com>
---
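Note: the thread count is fixed at configure time through the new meson
option. For example, a build with four worker threads (four is only an
illustrative value; any integer within the option's declared range works)
would be configured as:

  $ meson setup build -Dnntr-num-threads=4
  $ ninja -C build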
 meson.build                       |  3 ++
 meson_options.txt                 |  2 ++
 nntrainer/layers/conv2d_layer.cpp | 58 +++++++++++++++++++++++++++++++++------
 3 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/meson.build b/meson.build
index 2d1efeb..96f6448 100644
--- a/meson.build
+++ b/meson.build
@@ -156,6 +156,9 @@ if get_option('enable-blas')
   endif
 endif
 
+extra_defines += '-DNNTR_NUM_THREADS=@0@'.format(get_option('nntr-num-threads'))
+message('set nntrainer num threads=@0@'.format(get_option('nntr-num-threads')))
+
 openmp_dep = dummy_dep
 if get_option('enable-openmp')
   openmp_dep = dependency('openmp')
diff --git a/meson_options.txt b/meson_options.txt
index 7a55290..6f67cb3 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -21,6 +21,8 @@ option('capi-ml-common-actual', type: 'string', value: 'capi-ml-common',
 option('tizen-version-major', type: 'integer', min : 4, max : 9999, value: 9999) # 9999 means "not Tizen"
 option('tizen-version-minor', type: 'integer', min : 0, max : 9999, value: 0)
 option('openblas-num-threads', type: 'integer', min : 0, max : 9999, value: 0)
+# This option enables multi-threading (along the batch direction) in nntrainer.
+option('nntr-num-threads', type: 'integer', min : 0, max : 9999, value: 2)
 
 # test related option
 option('reduce-tolerance', type: 'boolean', value: true)
diff --git a/nntrainer/layers/conv2d_layer.cpp b/nntrainer/layers/conv2d_layer.cpp
index 6c685d1..32c4ad8 100644
--- a/nntrainer/layers/conv2d_layer.cpp
+++ b/nntrainer/layers/conv2d_layer.cpp
@@ -25,6 +25,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <thread>
 #include <...>
 
 namespace nntrainer {
@@ -449,16 +450,57 @@ void Conv2DLayer::calcDerivative(RunLayerContext &context) {
   /// for each batch
   /// filter_kernel^T X derivative -> column matrix
   /// col2im(column matrix) to reconstruct the original image
-  Tensor &col2im_result = context.getTensor(wt_idx[ConvParams::inter_result]);
-  col2im_result.reshape(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
-  for (unsigned int b = 0; b < derivative.batch(); ++b) {
-    Tensor deriv_sub = derivative.getBatchSlice(b, 1);
-    Tensor in_deriv_sub = input_derivative.getBatchSlice(b, 1);
-    deriv_sub.reshape({filter_size, derivative.width() * derivative.height()});
+  unsigned int num_threads = NNTR_NUM_THREADS;
+
+  if (num_threads > derivative.batch())
+    num_threads = 1;
+
+  if (num_threads > 1) {
+    auto dowork = [&](size_t s, size_t e, void *user_data) {
+      for (size_t b = s; b < e; ++b) {
+        Tensor result =
+          Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
+        Tensor deriv_sub = derivative.getBatchSlice(b, 1);
+        Tensor in_deriv_sub = input_derivative.getBatchSlice(b, 1);
+        deriv_sub.reshape(
+          {filter_size, derivative.width() * derivative.height()});
+        filter_kernel.dot(deriv_sub, result, true, false);
+        col2im(result, filter_dim, padding, stride, dilation, in_deriv_sub);
+      }
+    };
+
+    size_t start = 0;
+    size_t end = derivative.batch();
+    size_t chunk = (end - start + (num_threads - 1)) / num_threads;
+
+    std::vector<std::thread> workers;
+
+    for (unsigned int i = 0; i < num_threads; ++i) {
+      size_t s = start + i * chunk;
+      size_t e = s + chunk;
+      if (e > end)
+        e = end;
+      workers.push_back(std::thread(dowork, s, e, nullptr));
+    }
 
-    filter_kernel.dot(deriv_sub, col2im_result, true, false);
-    col2im(col2im_result, filter_dim, padding, stride, dilation, in_deriv_sub);
+    for (unsigned int i = 0; i < num_threads; ++i)
+      workers[i].join();
+
+  } else {
+
+    Tensor &col2im_result = context.getTensor(wt_idx[ConvParams::inter_result]);
+    col2im_result.reshape(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
+
+    for (unsigned int b = 0; b < derivative.batch(); ++b) {
+      Tensor deriv_sub = derivative.getBatchSlice(b, 1);
+      Tensor in_deriv_sub = input_derivative.getBatchSlice(b, 1);
+      deriv_sub.reshape(
+        {filter_size, derivative.width() * derivative.height()});
+
+      filter_kernel.dot(deriv_sub, col2im_result, true, false);
+      col2im(col2im_result, filter_dim, padding, stride, dilation,
+             in_deriv_sub);
+    }
   }
 
   filter_kernel.reshape(filter_dim);
-- 
2.7.4
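For readers who want to try the batch-splitting pattern outside of
nntrainer, below is a minimal standalone sketch of the same
ceiling-division chunking and join logic used in calcDerivative() above.
The per-batch tensor work is replaced by a trivial placeholder, and the
example sizes are arbitrary; this is an illustration, not nntrainer code:

  #include <algorithm>
  #include <cstddef>
  #include <iostream>
  #include <thread>
  #include <vector>

  int main() {
    const std::size_t batch = 10; // stands in for derivative.batch()
    unsigned int num_threads = 4; // stands in for NNTR_NUM_THREADS

    // Same guard as the patch: with fewer batch entries than threads,
    // fall back to a single thread.
    if (num_threads > batch)
      num_threads = 1;

    std::vector<int> results(batch, 0);

    // Each worker processes the half-open index range [s, e).
    auto dowork = [&](std::size_t s, std::size_t e) {
      for (std::size_t b = s; b < e; ++b)
        results[b] = static_cast<int>(b) * 2; // placeholder per-batch work
    };

    // Ceiling division, as in the patch, so the chunks cover every
    // batch index exactly once.
    const std::size_t chunk = (batch + num_threads - 1) / num_threads;

    std::vector<std::thread> workers;
    for (unsigned int i = 0; i < num_threads; ++i) {
      std::size_t s = i * chunk;
      std::size_t e = std::min(s + chunk, batch);
      if (s >= e)
        break; // every index is already assigned
      workers.emplace_back(dowork, s, e);
    }
    for (auto &w : workers)
      w.join();

    for (std::size_t b = 0; b < batch; ++b)
      std::cout << results[b] << ' ';
    std::cout << '\n';
  }

Each worker writes only to its own slice of results. This mirrors how
the threaded path in the patch gives every batch slice its own col2im
buffer instead of sharing the inter_result tensor, which is what makes
the loop safe without locks.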