From 7a8baad25060c1ab7925588df598ce2664572cd1 Mon Sep 17 00:00:00 2001
From: "jijoong.moon" <jijoong.moon@samsung.com>
Date: Thu, 7 Jul 2022 18:24:54 +0900
Subject: [PATCH] [ Layers ] Add Parallelization along batch direction

This patch demonstrates parallelization along the batch direction,
using conv2d calcDerivative() as the example.

. add the meson option with the 'nntr-num-threads' key and an integer value
. add the extra compile option NNTR_NUM_THREADS (default value is 1)

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <jijoong.moon@samsung.com>
---
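Note: the thread count is fixed at configure time through the new meson
option. For example, a build with four worker threads (four is only an
illustrative value; any integer within the option's declared range works)
would be configured as:

  $ meson setup build -Dnntr-num-threads=4
  $ ninja -C build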
 meson.build                       |  3 ++
 meson_options.txt                 |  2 ++
 nntrainer/layers/conv2d_layer.cpp | 58 +++++++++++++++++++++++++++++++++------
 3 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/meson.build b/meson.build
index 2d1efeb..96f6448 100644
--- a/meson.build
+++ b/meson.build
@@ -156,6 +156,9 @@ if get_option('enable-blas')
   endif
 endif
 
+extra_defines += '-DNNTR_NUM_THREADS=@0@'.format(get_option('nntr-num-threads'))
+message('set nntrainer num threads=@0@'.format(get_option('nntr-num-threads')))
+
 openmp_dep = dummy_dep
 if get_option('enable-openmp')
   openmp_dep = dependency('openmp')
diff --git a/meson_options.txt b/meson_options.txt
index 7a55290..6f67cb3 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -21,6 +21,8 @@ option('capi-ml-common-actual', type: 'string', value: 'capi-ml-common',
 option('tizen-version-major', type: 'integer', min : 4, max : 9999, value: 9999) # 9999 means "not Tizen"
 option('tizen-version-minor', type: 'integer', min : 0, max : 9999, value: 0)
 option('openblas-num-threads', type: 'integer', min : 0, max : 9999, value: 0)
+# This option enables multi-threading (along the batch direction) in nntrainer.
+option('nntr-num-threads', type: 'integer', min : 0, max : 9999, value: 2)
 
 # test related option
 option('reduce-tolerance', type: 'boolean', value: true)
diff --git a/nntrainer/layers/conv2d_layer.cpp b/nntrainer/layers/conv2d_layer.cpp
index 6c685d1..32c4ad8 100644
--- a/nntrainer/layers/conv2d_layer.cpp
+++ b/nntrainer/layers/conv2d_layer.cpp
@@ -25,6 +25,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <thread>
 #include <...>
 
 namespace nntrainer {
@@ -449,16 +450,57 @@ void Conv2DLayer::calcDerivative(RunLayerContext &context) {
   /// for each batch
   /// filter_kernel^T X derivative -> column matrix
   /// col2im(column matrix) to reconstruct the original image
-  Tensor &col2im_result = context.getTensor(wt_idx[ConvParams::inter_result]);
-  col2im_result.reshape(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
-  for (unsigned int b = 0; b < derivative.batch(); ++b) {
-    Tensor deriv_sub = derivative.getBatchSlice(b, 1);
-    Tensor in_deriv_sub = input_derivative.getBatchSlice(b, 1);
-    deriv_sub.reshape({filter_size, derivative.width() * derivative.height()});
+  unsigned int num_threads = NNTR_NUM_THREADS;
+
+  if (num_threads > derivative.batch())
+    num_threads = 1;
+
+  if (num_threads > 1) {
+    auto dowork = [&](size_t s, size_t e, void *user_data) {
+      for (size_t b = s; b < e; ++b) {
+        Tensor result =
+          Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
+        Tensor deriv_sub = derivative.getBatchSlice(b, 1);
+        Tensor in_deriv_sub = input_derivative.getBatchSlice(b, 1);
+        deriv_sub.reshape(
+          {filter_size, derivative.width() * derivative.height()});
+        filter_kernel.dot(deriv_sub, result, true, false);
+        col2im(result, filter_dim, padding, stride, dilation, in_deriv_sub);
+      }
+    };
+
+    size_t start = 0;
+    size_t end = derivative.batch();
+    size_t chunk = (end - start + (num_threads - 1)) / num_threads;
+
+    std::vector<std::thread> workers;
+
+    for (unsigned int i = 0; i < num_threads; ++i) {
+      size_t s = start + i * chunk;
+      size_t e = s + chunk;
+      if (e > end)
+        e = end;
+      workers.push_back(std::thread(dowork, s, e, nullptr));
+    }
 
-    filter_kernel.dot(deriv_sub, col2im_result, true, false);
-    col2im(col2im_result, filter_dim, padding, stride, dilation, in_deriv_sub);
+    for (unsigned int i = 0; i < num_threads; ++i)
+      workers[i].join();
+
+  } else {
+
+    Tensor &col2im_result = context.getTensor(wt_idx[ConvParams::inter_result]);
+    col2im_result.reshape(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
+
+    for (unsigned int b = 0; b < derivative.batch(); ++b) {
+      Tensor deriv_sub = derivative.getBatchSlice(b, 1);
+      Tensor in_deriv_sub = input_derivative.getBatchSlice(b, 1);
+      deriv_sub.reshape(
+        {filter_size, derivative.width() * derivative.height()});
+
+      filter_kernel.dot(deriv_sub, col2im_result, true, false);
+      col2im(col2im_result, filter_dim, padding, stride, dilation,
+             in_deriv_sub);
+    }
   }
 
   filter_kernel.reshape(filter_dim);
-- 
2.7.4
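For readers who want to try the batch-splitting pattern outside of
nntrainer, below is a minimal standalone sketch of the same
ceiling-division chunking and join logic used in calcDerivative() above.
The per-batch tensor work is replaced by a trivial placeholder, and the
example sizes are arbitrary; this is an illustration, not nntrainer code:

  #include <algorithm>
  #include <cstddef>
  #include <iostream>
  #include <thread>
  #include <vector>

  int main() {
    const std::size_t batch = 10; // stands in for derivative.batch()
    unsigned int num_threads = 4; // stands in for NNTR_NUM_THREADS

    // Same guard as the patch: with fewer batch entries than threads,
    // fall back to a single thread.
    if (num_threads > batch)
      num_threads = 1;

    std::vector<int> results(batch, 0);

    // Each worker processes the half-open index range [s, e).
    auto dowork = [&](std::size_t s, std::size_t e) {
      for (std::size_t b = s; b < e; ++b)
        results[b] = static_cast<int>(b) * 2; // placeholder per-batch work
    };

    // Ceiling division, as in the patch, so the chunks cover every
    // batch index exactly once.
    const std::size_t chunk = (batch + num_threads - 1) / num_threads;

    std::vector<std::thread> workers;
    for (unsigned int i = 0; i < num_threads; ++i) {
      std::size_t s = i * chunk;
      std::size_t e = std::min(s + chunk, batch);
      if (s >= e)
        break; // every index is already assigned
      workers.emplace_back(dowork, s, e);
    }
    for (auto &w : workers)
      w.join();

    for (std::size_t b = 0; b < batch; ++b)
      std::cout << results[b] << ' ';
    std::cout << '\n';
  }

Each worker writes only to its own slice of results. This mirrors how
the threaded path in the patch gives every batch slice its own col2im
buffer instead of sharing the inter_result tensor, which is what makes
the loop safe without locks.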