From 4319e18dc813e7aa305c77366e930bac2efc53bc Mon Sep 17 00:00:00 2001 From: =?utf8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B9=20=D0=A8=D0=B5=D0=B4?= =?utf8?q?=D1=8C=D0=BA=D0=BE/AI=20Tools=20Lab=20/SRR/Engineer/=EC=82=BC?= =?utf8?q?=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Wed, 16 Jan 2019 19:45:39 +0300 Subject: [PATCH] [nnc] Add temporary im2col buffer reuse (#2859) Add temporary buffer reuse to soft backend for Conv Implemented via a temporary tensor Signed-off-by: Andrei Shedko --- contrib/nnc/passes/soft_backend/CPPGenerator.cpp | 3 +++ contrib/nnc/passes/soft_backend/ModelAnalyzer.cpp | 29 +++++++++++++++++++--- contrib/nnc/passes/soft_backend/ModelAnalyzer.h | 25 ++++++++++++++++--- .../soft_backend/code_snippets/cpp_operations.def | 18 +++++--------- .../nnc/unittests/soft_backend/CPPOperations.cpp | 21 +++++++++------- 5 files changed, 69 insertions(+), 27 deletions(-) diff --git a/contrib/nnc/passes/soft_backend/CPPGenerator.cpp b/contrib/nnc/passes/soft_backend/CPPGenerator.cpp index 1e26996..af905e7 100644 --- a/contrib/nnc/passes/soft_backend/CPPGenerator.cpp +++ b/contrib/nnc/passes/soft_backend/CPPGenerator.cpp @@ -244,6 +244,9 @@ void CPPCodeGenerator::printGetter(ostream &out, const string &className, const void CPPCodeGenerator::materializeInferenceSequence(ostream &out, const ModelAnalyzer &ma) { using OpDescr = OpDescr; + // Allocate temporary(im2col) tensor + out << " Tensor " << _formattedTensors[ma.getTempTID()] << + "(Shape{" << ma.getMaxTemporarySize() << "});\n"; for (const OpDescr &op: ma.getInferenceSequence()) { if (op._op->getType() == mir::Operation::Type::variable) diff --git a/contrib/nnc/passes/soft_backend/ModelAnalyzer.cpp b/contrib/nnc/passes/soft_backend/ModelAnalyzer.cpp index 77e12de..ae35ab4 100644 --- a/contrib/nnc/passes/soft_backend/ModelAnalyzer.cpp +++ b/contrib/nnc/passes/soft_backend/ModelAnalyzer.cpp @@ -63,7 +63,8 @@ namespace nnc using namespace nnc::mir; -void ModelAnalyzer::addOpDescr(Operation* op, const string& function_name) { +void ModelAnalyzer::addOpDescr( + Operation* op, const string& function_name, std::vector aux_args = {}) { vector node_output_tensors; const string &op_name = op->getName(); @@ -97,6 +98,11 @@ void ModelAnalyzer::addOpDescr(Operation* op, const string& function_name) { node_input_tensors.push_back(inTid); } + // this op uses temporary memory (e.g. im2col) + if (!aux_args.empty()) { + std::copy(aux_args.begin(), aux_args.end(), std::back_inserter(node_input_tensors)); + } + _inferenceSequence.push_back({op, function_name, std::move(node_input_tensors), std::move(node_output_tensors), @@ -104,6 +110,10 @@ void ModelAnalyzer::addOpDescr(Operation* op, const string& function_name) { _opToDescr[op] = &_inferenceSequence.back(); } +void ModelAnalyzer::updateMaxTemporarySize(const size_t size) { + _max_temp_size = std::max(_max_temp_size, size); +} + size_t ModelAnalyzer::declareInputTensor(const std::string& name, const mir::Shape& shape) { assert(!name.empty() && "Input tensor must have name"); size_t id = _allocatedTensors++; @@ -144,6 +154,9 @@ void ModelAnalyzer::analyze(const mir::Graph* g) { auto constants = g->collectConstants(); init_ops.insert(init_ops.end(), constants.begin(), constants.end()); + // Register temporary tensor for im2col buffer + _temp_tensor_id = declareTemporaryTensor(); + // Walk all network inputs for (Operation* in : init_ops) { assert(dynamic_cast(in) || dynamic_cast(in)); @@ -191,7 +204,12 @@ void ModelAnalyzer::visit(ops::ConcatOp& op) { } void ModelAnalyzer::visit(ops::Conv2DOp& op) { - addOpDescr(&op, "conv2d"); + const auto& kernel_shape = op.getKernel().getShape(); + const auto& out_shape = op.getOutputShape(0); + const int32_t tmp_size = kernel_shape.dim(0) * kernel_shape.dim(1) * kernel_shape.dim(2) + * out_shape.dim(0) * out_shape.dim(1) * out_shape.dim(2); + updateMaxTemporarySize(static_cast(tmp_size)); + addOpDescr(&op, "conv2d", {_temp_tensor_id}); } void ModelAnalyzer::visit(ops::DepthwiseConv2DOp& op) { @@ -314,7 +332,12 @@ void ModelAnalyzer::visit(mir::ops::EluOp& op) { } void ModelAnalyzer::visit(mir::ops::DeConv2DOp& op) { - addOpDescr(&op, "convTransposed2d"); + const auto& kernel_shape = op.getKernel().getShape(); + const auto& out_shape = op.getOutputShape(0); + const int32_t tmp_size = kernel_shape.dim(0) * kernel_shape.dim(1) * kernel_shape.dim(3) * + out_shape.dim(0) * out_shape.dim(1) * out_shape.dim(2); + updateMaxTemporarySize(static_cast(tmp_size)); + addOpDescr(&op, "convTransposed2d", {_temp_tensor_id}); } void ModelAnalyzer::visit(ops::SqueezeOp& op) { diff --git a/contrib/nnc/passes/soft_backend/ModelAnalyzer.h b/contrib/nnc/passes/soft_backend/ModelAnalyzer.h index 1afb2ab..700e966 100644 --- a/contrib/nnc/passes/soft_backend/ModelAnalyzer.h +++ b/contrib/nnc/passes/soft_backend/ModelAnalyzer.h @@ -28,6 +28,8 @@ #include #include #include +#include +#include namespace nnc { @@ -71,6 +73,7 @@ struct OpDescr { // list of output tensors std::vector _outputs; size_t _paramStartOffset; + std::list _temporaries; }; /** @@ -166,16 +169,32 @@ public: return _modelName; } + const size_t getMaxTemporarySize() const { + return _max_temp_size; + } + + const size_t getTempTID() const { + return _temp_tensor_id; + } + private: /** * @brief Common function to add function call in inference sequence * @param op Node representing added call * @param function_name Function name + * @param aux_args Auxilliary argument ids * * Inserts information about CG operation into inference sequence: name of operation, * creates tensors for operation outputs, binds operation inputs with tensors from previous operations */ - void addOpDescr(mir::Operation* op, const std::string& function_name); + void addOpDescr(mir::Operation* op, + const std::string& function_name, std::vector aux_args); + + /** + * @brief Registers a temporary buffer of size *size* used by op *op_id* + * @param size Size of buffer + */ + void updateMaxTemporarySize(const size_t size); /** * @brief Declares input tensor in artifact @@ -201,13 +220,13 @@ private: std::string _modelName = "NN"; std::list _inferenceSequence; size_t _allocatedTensors = 0; - - /// @brief list of artifact inputs std::vector _inputs; /// @brief list of persistent tensors std::vector _persistent_tensors; /// @brief list of tensor ids corresponding to NN outputs std::vector _outputs; + size_t _max_temp_size = 0; + size_t _temp_tensor_id; std::vector _tensors; std::map _opToDescr; }; diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def index 8ab0889..2aaabca 100644 --- a/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def +++ b/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def @@ -237,8 +237,7 @@ void concat(Tensor &out, const char *params, const Args &...inputs) out.getData(), shapeToDims(out.getShape())); } -void conv2d(Tensor &out, const char *params, const Tensor &in) -{ +void conv2d(Tensor& out, const char* params, const Tensor& in, Tensor& temporary) { const float *input = in.getData(); Dims<4> input_d = shapeToDims(in.getShape()); Kernel kernel = deserializeKernel(params); @@ -269,10 +268,10 @@ void conv2d(Tensor &out, const char *params, const Tensor &in) const int pad_w = pads[1]; const int pad_h = pads[0]; - unique_ptr im2col(nullptr, [](float *d){delete [] d;}); + float* im2col_data = nullptr; if (stride_w != 1 || stride_h != 1 || kernel.dims.sizes[1] != 1 || kernel.dims.sizes[2] != 1) { - im2col.reset(new float[volume(im2col_d)]); + im2col_data = temporary.getData(); } Conv(input, input_d, @@ -280,10 +279,10 @@ void conv2d(Tensor &out, const char *params, const Tensor &in) stride_w, stride_h, pad_w, pad_h, out.getData(), out_d, - im2col.get(), im2col_d); + im2col_data, im2col_d); } -void convTransposed2d(Tensor &out, const char *params, const Tensor &in) { +void convTransposed2d(Tensor& out, const char* params, const Tensor& in, Tensor& temporary) { const float *input = in.getData(); RuntimeShape input_shape = shapeToRuntimeShape(in.getShape()); KernelRT kernel = deserializeKernelRT(params); @@ -316,14 +315,9 @@ void convTransposed2d(Tensor &out, const char *params, const Tensor &in) { const auto convPara = ConvParams({PaddingType::kSame, PaddingValues({pad_w,pad_h}), stride_w, stride_h}); - unique_ptr im2col(nullptr, [](float *d){delete [] d;}); - if (stride_w != 1 || stride_h != 1 || kernel.shape.Dims(1) != 1 || kernel.shape.Dims(2) != 1) { - im2col.reset(new float[im2col_shape.FlatSize()]); - } - TransposeConv( convPara, input_shape, input, kernel.shape, kernel.data, - out_shape, out.getData(), im2col_shape, im2col.get()); + out_shape, out.getData(), im2col_shape, temporary.getData()); } void depthwiseConv2d(Tensor &out, const char *params, const Tensor &in) diff --git a/contrib/nnc/unittests/soft_backend/CPPOperations.cpp b/contrib/nnc/unittests/soft_backend/CPPOperations.cpp index 4234bc0..eeded6b 100644 --- a/contrib/nnc/unittests/soft_backend/CPPOperations.cpp +++ b/contrib/nnc/unittests/soft_backend/CPPOperations.cpp @@ -119,7 +119,7 @@ namespace { */ mir::Operation* fillGraph(mir::Graph& g, - function& inputs)> op_gen, + const function& inputs)>& op_gen, const vector>& input_ntensors) { // Create inputs std::vector inputs; @@ -173,7 +173,7 @@ void fillShapes(mir::Shape &nshape, Shape &ashape, const vector &raw_shape_ void fillNTensor(mir::TensorVariant &dst, float start) { float t = start; mir::Tensor wrapper(dst); - for (mir::Index idx: mir::ShapeRange(dst.getShape())) { + for (const mir::Index& idx: mir::ShapeRange(dst.getShape())) { wrapper.at(idx) = sin(t) * 2.0f; t += 1.0f; } @@ -314,11 +314,11 @@ void compareResults(const mir::TensorVariant &ref_nnc_tensor, const Tensor &test */ template void createAndRunTestGraph( - function& inputs)> op_generator, - TestFunc artifactOperation, - const vector> &input_ntensors, - const Args &...input_atensors) { + function& inputs)> op_generator, + TestFunc artifactOperation, + const vector>& input_ntensors, + Args& ...input_atensors) { mir::Graph g; mir::Operation *actual_operation = fillGraph(g, op_generator, input_ntensors); @@ -586,6 +586,7 @@ TEST(cpp_operations_test, convTransposed2d) { // stride width, stride height // size 3 is chosen to cover all cases, where width bigger/smaller then height and equal/not equal to 1 using iT = int32_t; + Tensor temporary(Shape({1024 * 40})); for (iT kernel_h = 2; kernel_h <= 4; ++kernel_h) for (iT kernel_w = 2; kernel_w <= 4; ++kernel_w) for (iT input_c = 1; input_c <= 3; ++input_c) @@ -606,7 +607,8 @@ TEST(cpp_operations_test, convTransposed2d) { return g.create("y", inputs[0], kernel, strides, pad_t); }; - createAndRunTestGraph(op_generator, convTransposed2d, input_ntensors, input_atensor); + createAndRunTestGraph(op_generator, convTransposed2d, input_ntensors, input_atensor, + temporary); } } @@ -616,6 +618,7 @@ TEST(cpp_operations_test, conv2d) { // stride width, stride height // size 3 is chosen to cover all cases, where width bigger/smaller then height and equal/not equal to 1 using iT = int32_t; + Tensor temporary(Shape({1024 * 20})); for (iT kernel_h = 1; kernel_h <= 3; ++kernel_h) for (iT kernel_w = 1; kernel_w <= 3; ++kernel_w) for (iT input_c = 1; input_c <= 3; ++input_c) @@ -636,7 +639,7 @@ TEST(cpp_operations_test, conv2d) { padding); }; - createAndRunTestGraph(op_generator, conv2d, input_ntensors, input_atensor); + createAndRunTestGraph(op_generator, conv2d, input_ntensors, input_atensor, temporary); } } -- 2.7.4