From 275cb9d5a9095819c34bdfe630c211ddda7122a1 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B9=20=D0=A8=D0=B5=D0=B4?=
 =?utf8?q?=D1=8C=D0=BA=D0=BE/AI=20Tools=20Lab=20/SRR/Assistant=20Engineer/?=
 =?utf8?q?=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?=
Date: Mon, 12 Nov 2018 12:18:36 +0300
Subject: [PATCH] [nnc] Style transfer soft backend no conv transpose (#1934)

Added snippets for the new ops (taken from TFLite), as well as soft
backend tests for these ops.

Signed-off-by: Andrei Shedko
---
 contrib/nnc/core/modelIR/ShapeInference.cpp        |   2 +-
 contrib/nnc/passes/interpreter/Interpreter.cpp     |   2 +-
 contrib/nnc/passes/soft_backend/CPPGenerator.cpp   |   6 ++
 contrib/nnc/passes/soft_backend/ModelAnalyzer.cpp  |  26 +++--
 contrib/nnc/passes/soft_backend/SBSerializer.cpp   |   4 +-
 .../soft_backend/code_snippets/cpp_elementwise.def | 110 ++++++++++++++++++++
 .../passes/soft_backend/code_snippets/cpp_elu.def  |  29 +++++
 .../soft_backend/code_snippets/cpp_operations.def  |  42 ++++++-
 .../passes/soft_backend/code_snippets/cpp_tanh.def |  21 ++++
 .../passes/soft_backend/code_snippets/eigen.def    |  44 +++++++
 .../nnc/unittests/soft_backend/CPPOperations.cpp   |  87 +++++++++++++
 11 files changed, 362 insertions(+), 11 deletions(-)
 create mode 100644 contrib/nnc/passes/soft_backend/code_snippets/cpp_elementwise.def
 create mode 100644 contrib/nnc/passes/soft_backend/code_snippets/cpp_elu.def
 create mode 100644 contrib/nnc/passes/soft_backend/code_snippets/cpp_tanh.def

diff --git a/contrib/nnc/core/modelIR/ShapeInference.cpp b/contrib/nnc/core/modelIR/ShapeInference.cpp
index 605cb32..65ba3d0 100644
--- a/contrib/nnc/core/modelIR/ShapeInference.cpp
+++ b/contrib/nnc/core/modelIR/ShapeInference.cpp
@@ -292,7 +292,7 @@ void ShapeInference::visit(INode::Ref node, ops::DeConv2DOp &op) {
 
   assert(kernel_shape.rank() == 4);
   assert(in_shape.rank() == 3);
-  assert(kernel_shape.dim(3) == in_shape.dim(2) && kernel_shape.dim(2) && in_shape.dim(2));
+  assert(kernel_shape.dim(3) == in_shape.dim(2));
 
   auto pad_type = op.getPaddingType();
 
diff --git a/contrib/nnc/passes/interpreter/Interpreter.cpp b/contrib/nnc/passes/interpreter/Interpreter.cpp
index 430c803..e1dfb1e 100644
--- a/contrib/nnc/passes/interpreter/Interpreter.cpp
+++ b/contrib/nnc/passes/interpreter/Interpreter.cpp
@@ -272,7 +272,7 @@ void NNInterpreter::visit(INode::Ref node, ops::EluOp &op) {
     if (input.at(id) >= 0)
      return input.at(id);
     else
-      return op.getAlpha()*(exp2f(input.at(id))-1);
+      return op.getAlpha()*(expf(input.at(id))-1);
   })();
 }
 
diff --git a/contrib/nnc/passes/soft_backend/CPPGenerator.cpp b/contrib/nnc/passes/soft_backend/CPPGenerator.cpp
index b70c855..6a776bf 100644
--- a/contrib/nnc/passes/soft_backend/CPPGenerator.cpp
+++ b/contrib/nnc/passes/soft_backend/CPPGenerator.cpp
@@ -40,6 +40,9 @@ using namespace std;
 #include "cpp_scale.generated.h"
 #include "cpp_dropout.generated.h"
 #include "cpp_batchnorm.generated.h"
+#include "cpp_elu.generated.h"
+#include "cpp_tanh.generated.h"
+#include "cpp_elementwise.generated.h"
 
 namespace nnc
 {
@@ -277,6 +280,9 @@ void CPPCodeGenerator::materializeCode(ostream &out, const ModelAnalyzer &ma, co
   out.write(cpp_pool, sizeof(cpp_pool));
   out.write(cpp_relu, sizeof(cpp_relu));
   out.write(cpp_softmax, sizeof(cpp_softmax));
+  out.write(cpp_elementwise, sizeof(cpp_elementwise));
+  out.write(cpp_elu, sizeof(cpp_elu));
+  out.write(cpp_tanh, sizeof(cpp_tanh));
 
   out.write(cpp_operations, sizeof(cpp_operations));
   out.write(cpp_scale, sizeof(cpp_scale));
diff --git a/contrib/nnc/passes/soft_backend/ModelAnalyzer.cpp b/contrib/nnc/passes/soft_backend/ModelAnalyzer.cpp
index 0f6bc34..92dfbd9 100644
--- a/contrib/nnc/passes/soft_backend/ModelAnalyzer.cpp
+++ b/contrib/nnc/passes/soft_backend/ModelAnalyzer.cpp
@@ -203,19 +203,33 @@ void ModelAnalyzer::visit(INode *node, ops::BatchNormOp &op) {
 }
 
 void ModelAnalyzer::visit(mir::INode *node, mir::ops::TanhOp &op) {
-  addOpDescr(node, "TanhOp");
+  addOpDescr(node, "tanh");
 }
 
 void ModelAnalyzer::visit(mir::INode *node, mir::ops::ElementwiseOp &op) {
-  addOpDescr(node, "Elementwise");
+  const char *funcName = nullptr;
+  switch (op.getOpType()) {
+    case ops::ElementwiseOp::OpType::sum:
+      funcName = "ElementWise<Add>";
+      break;
+    case ops::ElementwiseOp::OpType::prod:
+      funcName = "ElementWise<Mul>";
+      break;
+    case ops::ElementwiseOp::OpType::max:
+      funcName = "ElementWise<Max>";
+      break;
+    default:
+      assert(false && "unsupported elementwise operation type");
+  }
+  addOpDescr(node, funcName);
 }
 
-void ModelAnalyzer::visit(mir::INode *node, mir::ops::DeConv2DOp &op) {
-  addOpDescr(node, "DeConv2DOp");
+void ModelAnalyzer::visit(mir::INode *node, mir::ops::EluOp &op) {
+  addOpDescr(node, "elu");
 }
 
-void ModelAnalyzer::visit(mir::INode *node, mir::ops::EluOp &op) {
-  addOpDescr(node, "EluOp");
+void ModelAnalyzer::visit(mir::INode *node, mir::ops::DeConv2DOp &op) {
+  addOpDescr(node, "transposedconv2d");
 }
 
 void ModelAnalyzer::visit(INode* node, ops::SqueezeOp& op) {
diff --git a/contrib/nnc/passes/soft_backend/SBSerializer.cpp b/contrib/nnc/passes/soft_backend/SBSerializer.cpp
index e2788b9..f674eca 100644
--- a/contrib/nnc/passes/soft_backend/SBSerializer.cpp
+++ b/contrib/nnc/passes/soft_backend/SBSerializer.cpp
@@ -303,7 +303,7 @@ void Serializer::visit(mir::INode *node, mir::ops::TanhOp &op) {
 
 void Serializer::visit(mir::INode *node, mir::ops::ElementwiseOp &op) {
   _curOp->_paramStartOffset = _buffer.size();
-  serializeT((int32_t ) op.getOpType());
+  // The op type is known at codegen time, so only the input count is serialized.
   serializeT((int32_t) op.getNumInputs());
 }
 
@@ -316,7 +316,7 @@ void Serializer::visit(mir::INode *node, mir::ops::DeConv2DOp &op) {
   _curOp->_paramStartOffset = _buffer.size();
   // serialize kernel
   shared_ptr<TensorVariant> HWCNKernel = make_shared<TensorVariant>(op.getKernel());
-  // HWCN -> NHWC
+  // HWCN -> NHWC ([in channels, H, W, out channels] for the deconv kernel)
   shared_ptr<TensorVariant> NHWCKernel = transposeTensor<3, 0, 1, 2>(HWCNKernel);
   serializeTensor(*NHWCKernel);
   // serialize strides
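
Note: transposeTensor<3, 0, 1, 2> above performs a fixed axis permutation (axis k of the
result is axis perm[k] of the source), taking HWCN to NHWC. A standalone sketch of that
index mapping on a flat buffer; the helper name and explicit strides are illustrative,
not part of the backend:

    #include <cstddef>
    #include <vector>

    // HWCN -> NHWC: dst[n][h][w][c] = src[h][w][c][n]
    std::vector<float> transposeHWCNtoNHWC(const std::vector<float>& src,
                                           int h_size, int w_size, int c_size, int n_size) {
      std::vector<float> dst(src.size());
      for (int h = 0; h < h_size; ++h)
        for (int w = 0; w < w_size; ++w)
          for (int c = 0; c < c_size; ++c)
            for (int n = 0; n < n_size; ++n) {
              // row-major offsets for the two layouts
              std::size_t src_idx = ((std::size_t(h) * w_size + w) * c_size + c) * n_size + n;
              std::size_t dst_idx = ((std::size_t(n) * h_size + h) * w_size + w) * c_size + c;
              dst[dst_idx] = src[src_idx];
            }
      return dst;
    }
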
diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_elementwise.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_elementwise.def
new file mode 100644
index 0000000..96f2a1d
--- /dev/null
+++ b/contrib/nnc/passes/soft_backend/code_snippets/cpp_elementwise.def
@@ -0,0 +1,110 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+struct Add {
+  static inline void Add_(const float* input1_data, const float* input2_data,
+                          float* output_data, const int size) {
+    int i = 0;
+#ifdef USE_NEON
+
+    for (; i <= size - 16; i += 16) {
+      auto a10 = vld1q_f32(input1_data + i);
+      auto a11 = vld1q_f32(input1_data + i + 4);
+      auto a12 = vld1q_f32(input1_data + i + 8);
+      auto a13 = vld1q_f32(input1_data + i + 12);
+      auto a20 = vld1q_f32(input2_data + i);
+      auto a21 = vld1q_f32(input2_data + i + 4);
+      auto a22 = vld1q_f32(input2_data + i + 8);
+      auto a23 = vld1q_f32(input2_data + i + 12);
+      auto x0 = vaddq_f32(a10, a20);
+      auto x1 = vaddq_f32(a11, a21);
+      auto x2 = vaddq_f32(a12, a22);
+      auto x3 = vaddq_f32(a13, a23);
+      vst1q_f32(output_data + i, x0);
+      vst1q_f32(output_data + i + 4, x1);
+      vst1q_f32(output_data + i + 8, x2);
+      vst1q_f32(output_data + i + 12, x3);
+    }
+    for (; i <= size - 4; i += 4) {
+      auto a1 = vld1q_f32(input1_data + i);
+      auto a2 = vld1q_f32(input2_data + i);
+      auto x = vaddq_f32(a1, a2);
+      vst1q_f32(output_data + i, x);
+    }
+#endif // NEON
+
+    for (; i < size; i++) {
+      output_data[i] = input1_data[i] + input2_data[i];
+    }
+  }
+
+  static inline void Call(const float* input1_data, const float* input2_data,
+                          float* output_data, Dims<4> dims) {
+    Add_(input1_data, input2_data, output_data, FlatSize(dims));
+  }
+};
+
+struct Max {
+  static inline void Call(const float* input1_data, const float* input2_data,
+                          float* output_data, Dims<4> dims) {
+    auto output = MapAsVector(output_data, dims);
+    output = output.cwiseMax(MapAsVector(input2_data, dims));
+  }
+};
+
+struct Mul {
+  static inline void Call(const float* input1_data, const float* input2_data,
+                          float* output_data, Dims<4> dims) {
+    Mul_(input1_data, input2_data, output_data, FlatSize(dims));
+  }
+
+  static inline void Mul_(const float* input1_data, const float* input2_data,
+                          float* output_data, const int size) {
+
+    int i = 0;
+#ifdef USE_NEON
+    for (; i <= size - 16; i += 16) {
+      auto a10 = vld1q_f32(input1_data + i);
+      auto a11 = vld1q_f32(input1_data + i + 4);
+      auto a12 = vld1q_f32(input1_data + i + 8);
+      auto a13 = vld1q_f32(input1_data + i + 12);
+      auto a20 = vld1q_f32(input2_data + i);
+      auto a21 = vld1q_f32(input2_data + i + 4);
+      auto a22 = vld1q_f32(input2_data + i + 8);
+      auto a23 = vld1q_f32(input2_data + i + 12);
+      auto x0 = vmulq_f32(a10, a20);
+      auto x1 = vmulq_f32(a11, a21);
+      auto x2 = vmulq_f32(a12, a22);
+      auto x3 = vmulq_f32(a13, a23);
+
+      vst1q_f32(output_data + i, x0);
+      vst1q_f32(output_data + i + 4, x1);
+      vst1q_f32(output_data + i + 8, x2);
+      vst1q_f32(output_data + i + 12, x3);
+    }
+    for (; i <= size - 4; i += 4) {
+      auto a1 = vld1q_f32(input1_data + i);
+      auto a2 = vld1q_f32(input2_data + i);
+      auto x = vmulq_f32(a1, a2);
+
+      vst1q_f32(output_data + i, x);
+    }
+#endif // NEON
+
+    for (; i < size; i++) {
+      output_data[i] = input1_data[i] * input2_data[i];
+    }
+  }
+};
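
Note: these functors assume the calling convention of the ElementWise template defined in
cpp_operations.def below: the output buffer is seeded with the first input and then passed
as both input1_data and output_data on every Call, so Max::Call may ignore input1_data and
fold input2_data into the accumulator it aliases. A minimal scalar model of that contract
(standalone sketch, hypothetical names):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct MaxRef {
      static void Call(const float* in1, const float* in2, float* out, std::size_t n) {
        (void)in1;  // out aliases in1 (the running result), as in Max::Call above
        for (std::size_t i = 0; i < n; ++i)
          out[i] = std::max(out[i], in2[i]);
      }
    };

    int main() {
      std::vector<float> a{1, 5, 2}, b{4, 0, 3};
      std::vector<float> out = a;  // seed the accumulator with the first input
      MaxRef::Call(out.data(), b.data(), out.data(), out.size());
      assert(out[0] == 4 && out[1] == 5 && out[2] == 3);
    }
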
diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_elu.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_elu.def
new file mode 100644
index 0000000..8e41222
--- /dev/null
+++ b/contrib/nnc/passes/soft_backend/code_snippets/cpp_elu.def
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+inline void ELU(const float* input_data, const Dims<4>& input_dims,
+                float alpha, float* output_data,
+                const Dims<4>& output_dims) {
+
+  const auto input = MapAsVector(input_data, input_dims);
+  auto output = MapAsVector(output_data, output_dims);
+
+  output = input.array().min(0.0f);
+  // Keep exp() as a separate statement so Eigen can vectorize it.
+  output = output.array().exp();
+  output = (output.array() - 1.0f) * alpha;
+  output += input.cwiseMax(0.0f);
+}
diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def
index b65ff0d..3a059d6 100644
--- a/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def
+++ b/contrib/nnc/passes/soft_backend/code_snippets/cpp_operations.def
@@ -377,6 +377,47 @@ void relu(Tensor &out, const char *params, const Tensor &in)
   Relu(input, input_d, out.getData(), input_d);
 }
 
+void elu(Tensor &out, const char* params, const Tensor& in) {
+  const float* input = in.getData();
+  const Dims<4> inp_d = shapeToDims(in.getShape());
+
+  const float alpha = deserializeT<float>(params);
+  out.reShape(in.getShape());
+
+  ELU(input, inp_d, alpha, out.getData(), inp_d);
+}
+
+void tanhActivation(Tensor &out, const char* params, const Tensor& in) {
+  UNUSED(params);
+  const float* input = in.getData();
+  const Dims<4> inp_d = shapeToDims(in.getShape());
+
+  out.reShape(in.getShape());
+
+  float* output = out.getData();
+  const Dims<4> out_d = shapeToDims(in.getShape());
+  Tanh(input, inp_d, output, out_d);
+}
+
+// These operations (add, mul, max) take multiple input tensors (at least 2, likely fewer than 7);
+// the parameter pack generalizes the implementation over the number of inputs.
+template <typename F, typename ...Args>
+void ElementWise(Tensor &out,
+                 const char *params, const Args &...inputs) {
+  const float *input[] = {inputs.getData()...};
+
+  auto ins = std::tie(inputs...);
+
+  const Dims<4> out_d = shapeToDims(std::get<0>(ins).getShape());
+  const int32_t num_inputs = deserializeT<int32_t>(params);
+
+  out.reShape(std::get<0>(ins).getShape());
+  out.fillData(input[0]);
+  for (int32_t i = 1; i < num_inputs; ++i) {
+    F::Call(out.getData(), input[i], out.getData(), out_d);
+  }
+}
+
 void reshape(Tensor &out, const char *params, const Tensor &in)
 {
   Shape out_s = deserializeShape(params);
@@ -385,4 +426,3 @@ void reshape(Tensor &out, const char *params, const Tensor &in)
   out.reShape(out_s);
   out.fillData(in.getData());
 }
-
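
Note: the branch-free ELU in cpp_elu.def above relies on the identity
elu(x) = max(x, 0) + alpha * (exp(min(x, 0)) - 1), which reduces to x for x >= 0 and to
alpha * (exp(x) - 1) for x < 0, matching the interpreter's definition. A standalone scalar
check (illustrative only, not part of the generated code):

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    static float elu_ref(float x, float alpha) {
      return x >= 0.f ? x : alpha * (std::exp(x) - 1.f);
    }

    static float elu_branch_free(float x, float alpha) {
      return std::max(x, 0.f) + alpha * (std::exp(std::min(x, 0.f)) - 1.f);
    }

    int main() {
      for (float x : {-2.f, -0.5f, 0.f, 0.5f, 2.f})
        assert(std::fabs(elu_ref(x, 0.7f) - elu_branch_free(x, 0.7f)) < 1e-6f);
    }
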
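
Note: ElementWise above separates the operation (functor F) from the arity: the parameter
pack carries the tensors while num_inputs comes from the serialized parameters, and the
output is seeded with the first input before F is folded over the rest. The same shape can
be sketched with std::vector standing in for the backend Tensor type (an assumption; the
real type lives in cpp_header_types.def):

    #include <cstddef>
    #include <vector>

    struct AddRef {
      static void Call(const float* in1, const float* in2, float* out, std::size_t n) {
        for (std::size_t i = 0; i < n; ++i) out[i] = in1[i] + in2[i];
      }
    };

    // Requires at least one tensor in `rest`, mirroring the "at least 2 inputs" contract.
    template <typename F, typename... Args>
    std::vector<float> element_wise(const std::vector<float>& first, const Args&... rest) {
      std::vector<float> out = first;  // mirrors out.fillData(input[0])
      const std::vector<float>* inputs[] = {&rest...};
      for (const auto* in : inputs)
        F::Call(out.data(), in->data(), out.data(), out.size());
      return out;
    }

    // element_wise<AddRef>(a, b, c) computes a + b + c element by element.
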
diff --git a/contrib/nnc/passes/soft_backend/code_snippets/cpp_tanh.def b/contrib/nnc/passes/soft_backend/code_snippets/cpp_tanh.def
new file mode 100644
index 0000000..a85a6c5
--- /dev/null
+++ b/contrib/nnc/passes/soft_backend/code_snippets/cpp_tanh.def
@@ -0,0 +1,21 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+inline void Tanh(const float* input_data, const Dims<4>& input_dims,
+                 float* output_data, const Dims<4>& output_dims) {
+  auto input_map = MapAsVector(input_data, input_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  output_map.array() = input_map.array().tanh();
+}
diff --git a/contrib/nnc/passes/soft_backend/code_snippets/eigen.def b/contrib/nnc/passes/soft_backend/code_snippets/eigen.def
index 886ab0c..b871953 100644
--- a/contrib/nnc/passes/soft_backend/code_snippets/eigen.def
+++ b/contrib/nnc/passes/soft_backend/code_snippets/eigen.def
@@ -21365,6 +21365,50 @@ template<typename T> const typename T::Scalar* extract_data(const T& m)
 #endif
 
 namespace Eigen {
 namespace internal {
+  template <typename T>
+  T generic_fast_tanh_float(const T& a_x)
+  {
+    // Clamp the inputs to the range [-9, 9] since anything outside
+    // this range is +/-1.0f in single-precision.
+    const T plus_9 = pset1<T>(9.f);
+    const T minus_9 = pset1<T>(-9.f);
+    const T x = pmax(pmin(a_x, plus_9), minus_9);
+    // The monomial coefficients of the numerator polynomial (odd).
+    const T alpha_1 = pset1<T>(4.89352455891786e-03f);
+    const T alpha_3 = pset1<T>(6.37261928875436e-04f);
+    const T alpha_5 = pset1<T>(1.48572235717979e-05f);
+    const T alpha_7 = pset1<T>(5.12229709037114e-08f);
+    const T alpha_9 = pset1<T>(-8.60467152213735e-11f);
+    const T alpha_11 = pset1<T>(2.00018790482477e-13f);
+    const T alpha_13 = pset1<T>(-2.76076847742355e-16f);
+
+    // The monomial coefficients of the denominator polynomial (even).
+    const T beta_0 = pset1<T>(4.89352518554385e-03f);
+    const T beta_2 = pset1<T>(2.26843463243900e-03f);
+    const T beta_4 = pset1<T>(1.18534705686654e-04f);
+    const T beta_6 = pset1<T>(1.19825839466702e-06f);
+
+    // Since the polynomials are odd/even, we need x^2.
+    const T x2 = pmul(x, x);
+
+    // Evaluate the numerator polynomial p.
+    T p = pmadd(x2, alpha_13, alpha_11);
+    p = pmadd(x2, p, alpha_9);
+    p = pmadd(x2, p, alpha_7);
+    p = pmadd(x2, p, alpha_5);
+    p = pmadd(x2, p, alpha_3);
+    p = pmadd(x2, p, alpha_1);
+    p = pmul(x, p);
+
+    // Evaluate the denominator polynomial q.
+    T q = pmadd(x2, beta_6, beta_4);
+    q = pmadd(x2, q, beta_2);
+    q = pmadd(x2, q, beta_0);
+
+    // Divide the numerator by the denominator.
+    return pdiv(p, q);
+  }
+
 struct constructor_without_unaligned_array_assert {};
 template<typename T, int Size>
 EIGEN_DEVICE_FUNC
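
Note: with T = float the packet primitives used above (pset1, pmin, pmax, pmul, pmadd,
pdiv) reduce to ordinary scalar arithmetic, so the rational approximation can be
sanity-checked against std::tanh. A standalone scalar transcription (sketch only):

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    static float fast_tanh(float x) {
      x = std::max(-9.f, std::min(9.f, x));  // clamp as in generic_fast_tanh_float
      const float x2 = x * x;
      // Numerator p (odd polynomial), Horner form.
      float p = 2.00018790482477e-13f + x2 * -2.76076847742355e-16f;
      p = -8.60467152213735e-11f + x2 * p;
      p = 5.12229709037114e-08f + x2 * p;
      p = 1.48572235717979e-05f + x2 * p;
      p = 6.37261928875436e-04f + x2 * p;
      p = 4.89352455891786e-03f + x2 * p;
      p = x * p;
      // Denominator q (even polynomial).
      float q = 1.18534705686654e-04f + x2 * 1.19825839466702e-06f;
      q = 2.26843463243900e-03f + x2 * q;
      q = 4.89352518554385e-03f + x2 * q;
      return p / q;
    }

    int main() {
      for (float x = -8.f; x <= 8.f; x += 0.25f)
        assert(std::fabs(fast_tanh(x) - std::tanh(x)) < 1e-4f);
    }
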
diff --git a/contrib/nnc/unittests/soft_backend/CPPOperations.cpp b/contrib/nnc/unittests/soft_backend/CPPOperations.cpp
index f7d5be6..23a4f52 100644
--- a/contrib/nnc/unittests/soft_backend/CPPOperations.cpp
+++ b/contrib/nnc/unittests/soft_backend/CPPOperations.cpp
@@ -35,6 +35,9 @@
 #include "code_snippets/cpp_pool.def"
 #include "code_snippets/cpp_relu.def"
 #include "code_snippets/cpp_softmax.def"
+#include "code_snippets/cpp_elu.def"
+#include "code_snippets/cpp_elementwise.def"
+#include "code_snippets/cpp_tanh.def"
 
 #include "CommonData.def"
 #include "code_snippets/cpp_header_types.def"
@@ -59,6 +62,10 @@
 #include "core/modelIR/operations/BiasAddOp.h"
 #include "core/modelIR/operations/SoftmaxOp.h"
 #include "core/modelIR/operations/ScaleOp.h"
+#include "core/modelIR/operations/EluOp.h"
+#include "core/modelIR/operations/ElementwiseOp.h"
+#include "core/modelIR/operations/Deconv2DOp.h"
+#include "core/modelIR/operations/TanhOp.h"
 
 // various headers
 #include "core/modelIR/TensorVariant.h"
@@ -335,6 +342,64 @@ TEST(cpp_operations_test, concat)
   }
 }
 
+
+TEST(cpp_operations_test, add2) {
+  for (int numDims = 2; numDims <= 4; ++numDims) {
+    // test prerequisites
+    vector<int> shape_data{2, 3, 5, 7};
+    shape_data.resize(numDims);
+    vector<Tensor> input_a_tensors(2);
+    vector<unique_ptr<mir::TensorVariant>> input_n_tensors(2);
+    fillTensors(input_n_tensors[0], input_a_tensors[0], shape_data, 1.0f);
+    fillTensors(input_n_tensors[1], input_a_tensors[1], shape_data, 2.0f);
+    auto op_generator = [](mir::Graph& g) {
+      return g.create<mir::ops::ElementwiseOp>("y", mir::ops::ElementwiseOp::OpType::sum, 2);
+    };
+
+    createAndRunTestGraph(op_generator, ElementWise<Add>, input_n_tensors, input_a_tensors[0],
+                          input_a_tensors[1]);
+  }
+}
+
+TEST(cpp_operations_test, mul3) {
+  for (int numDims = 2; numDims <= 4; ++numDims) {
+    // test prerequisites
+    vector<int> shape_data{2, 3, 5, 7};
+    shape_data.resize(numDims);
+    vector<Tensor> input_a_tensors(3);
+    vector<unique_ptr<mir::TensorVariant>> input_n_tensors(3);
+    fillTensors(input_n_tensors[0], input_a_tensors[0], shape_data, 1.0f);
+    fillTensors(input_n_tensors[1], input_a_tensors[1], shape_data, 2.0f);
+    fillTensors(input_n_tensors[2], input_a_tensors[2], shape_data, 3.0f);
+    auto opGenerator = [](mir::Graph& g) {
+      return g.create<mir::ops::ElementwiseOp>("y", mir::ops::ElementwiseOp::OpType::prod, 3);
+    };
+
+    createAndRunTestGraph(opGenerator, ElementWise<Mul>, input_n_tensors, input_a_tensors[0],
+                          input_a_tensors[1], input_a_tensors[2]);
+  }
+}
+
+TEST(cpp_operations_test, max4) {
+  for (int numDims = 2; numDims <= 4; ++numDims) {
+    // test prerequisites
+    vector<int> shape_data{2, 3, 5, 7};
+    shape_data.resize(numDims);
+    vector<Tensor> input_a_tensors(4);
+    vector<unique_ptr<mir::TensorVariant>> input_n_tensors(4);
+    fillTensors(input_n_tensors[0], input_a_tensors[0], shape_data, 1.0f);
+    fillTensors(input_n_tensors[1], input_a_tensors[1], shape_data, 2.0f);
+    fillTensors(input_n_tensors[2], input_a_tensors[2], shape_data, 3.0f);
+    fillTensors(input_n_tensors[3], input_a_tensors[3], shape_data, 3.0f);
+    auto opGenerator = [](mir::Graph& g) {
+      return g.create<mir::ops::ElementwiseOp>("y", mir::ops::ElementwiseOp::OpType::max, 4);
+    };
+
+    createAndRunTestGraph(opGenerator, ElementWise<Max>, input_n_tensors, input_a_tensors[0],
+                          input_a_tensors[1], input_a_tensors[2], input_a_tensors[3]);
+  }
+}
+
 TEST(cpp_operations_test, conv2d)
 {
   // Iterate over kernel width, kernel height,
@@ -472,6 +537,28 @@ TEST(cpp_operations_test, relu)
   createAndRunTestGraph(opGenerator, relu, inputNTensors, aInputTensor);
 }
 
+TEST(cpp_operations_test, elu) {
+  // test prerequisites
+  vector<int> shape_data{2,3,4,5};
+  Tensor a_input_tensor;
+  vector<unique_ptr<mir::TensorVariant>> input_n_tensors(1);
+  fillTensors(input_n_tensors[0], a_input_tensor, shape_data, 1.0f);
+  auto op_generator = [](mir::Graph &g){return g.create<mir::ops::EluOp>("y", 1);};
+
+  createAndRunTestGraph(op_generator, elu, input_n_tensors, a_input_tensor);
+}
+
+TEST(cpp_operations_test, tanh) {
+  // test prerequisites
+  vector<int> shape_data{2,3,4,5};
+  Tensor a_input_tensor;
+  vector<unique_ptr<mir::TensorVariant>> input_n_tensors(1);
+  fillTensors(input_n_tensors[0], a_input_tensor, shape_data, 1.0f);
+  auto op_generator = [](mir::Graph &g){return g.create<mir::ops::TanhOp>("y");};
+
+  createAndRunTestGraph(op_generator, tanhActivation, input_n_tensors, a_input_tensor);
+}
+
 TEST(cpp_operations_test, softmax)
 {
   // iterate over number of dimensions in tensor
-- 
2.7.4