From 8ee8c43e735607497e1adadcbd4cfdf4786a4215 Mon Sep 17 00:00:00 2001
From: Seungbaek Hong
Date: Mon, 22 May 2023 11:32:26 +0900
Subject: [PATCH] [activation] add gelu function

Added the GELU activation function to support GPT.
A unit test for it was created using PyTorch.

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Seungbaek Hong
---
 api/ccapi/include/layer.h                           |  8 +++
 nntrainer/layers/acti_func.cpp                      | 30 ++++++++++++
 nntrainer/layers/acti_func.h                        | 18 +++++++
 nntrainer/layers/common_properties.h                |  8 +--
 .../unittest/layers/unittest_layers_activation.cpp  | 10 +++-
 test/unittest/unittest_nntrainer_activations.cpp    | 57 ++++++++++++++++++++++
 6 files changed, 126 insertions(+), 5 deletions(-)

diff --git a/api/ccapi/include/layer.h b/api/ccapi/include/layer.h
index 1998cd9..9690b74 100644
--- a/api/ccapi/include/layer.h
+++ b/api/ccapi/include/layer.h
@@ -530,6 +530,14 @@ Swish(const std::vector<std::string> &properties = {}) {
 }
 
 /**
+ * @brief Helper function to create gelu activation layer
+ */
+inline std::unique_ptr<Layer>
+GeLU(const std::vector<std::string> &properties = {}) {
+  return Activation("Activation=gelu", properties);
+}
+
+/**
  * @brief Helper function to create Tanh layer
  */
 inline std::unique_ptr<Layer>
diff --git a/nntrainer/layers/acti_func.cpp b/nntrainer/layers/acti_func.cpp
index fe52e00..d2370fd 100644
--- a/nntrainer/layers/acti_func.cpp
+++ b/nntrainer/layers/acti_func.cpp
@@ -152,6 +152,10 @@ void ActiFunc::setActiFunc(ActivationType acti_type) {
     in_place = false;
     this->setActivation(swish, swishPrime);
     break;
+  case ActivationType::ACT_GELU:
+    in_place = false;
+    this->setActivation(gelu, geluPrime);
+    break;
   case ActivationType::ACT_NONE:
     this->setActivation(no_op, no_op_prime);
     break;
@@ -356,6 +360,32 @@ Tensor &ActiFunc::swishPrime(Tensor const &t_in, Tensor const &t_out,
   return outgoing_derivative;
 }
 
+Tensor &ActiFunc::gelu(Tensor const &t_in, Tensor &t_out) {
+  float tmp = 1 / sqrt(2);
+  t_in.apply([&](float x) { return 0.5 * x * (1 + erf(x * tmp)); }, t_out);
+  return t_out;
+}
+
+Tensor &ActiFunc::geluPrime(Tensor const &t_in, Tensor const &t_out,
+                            Tensor &outgoing_derivative,
+                            Tensor const &incoming_derivative) {
+
+  if (outgoing_derivative.empty())
+    outgoing_derivative = Tensor(t_out.getDim());
+
+  float tmp = 1 / sqrt(2);
+  t_in.apply(
+    [&](float x) {
+      return 0.5 * (1 + erf(x * tmp) +
+                    x * ((2 / sqrt(M_PI)) * exp(-pow(x * tmp, 2))) * tmp);
+    },
+    outgoing_derivative);
+
+  outgoing_derivative.multiply_i_strided(incoming_derivative);
+
+  return outgoing_derivative;
+}
+
 void ActiFunc::executeInPlace(bool val) {
   if (val && !supportInPlace())
     throw std::runtime_error("Error setting activation layer to work in-place");
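
Note (not part of the patch): gelu() and geluPrime() above implement the exact,
erf-based form of GELU rather than the tanh approximation. Writing \Phi and
\phi for the standard normal CDF and PDF, the two lambdas compute

  \mathrm{GELU}(x)  = x\,\Phi(x) = \tfrac{1}{2}\,x\left(1 + \operatorname{erf}\!\left(x/\sqrt{2}\right)\right)

  \mathrm{GELU}'(x) = \Phi(x) + x\,\phi(x)
                    = \tfrac{1}{2}\left(1 + \operatorname{erf}\!\left(x/\sqrt{2}\right)\right) + \tfrac{x}{\sqrt{2\pi}}\,e^{-x^{2}/2},

where the last term matches the derivative code because
\tfrac{1}{2}\cdot\tfrac{2}{\sqrt{\pi}}\cdot\tfrac{1}{\sqrt{2}} = \tfrac{1}{\sqrt{2\pi}}.
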
diff --git a/nntrainer/layers/acti_func.h b/nntrainer/layers/acti_func.h
index b6d14c1..832fae3 100644
--- a/nntrainer/layers/acti_func.h
+++ b/nntrainer/layers/acti_func.h
@@ -190,6 +190,24 @@ public:
                            Tensor const &incoming_derivative = Tensor());
 
   /**
+   * @brief     gelu activation function
+   * @param[in] t_in input tensor
+   * @param[in] t_out output tensor
+   */
+  static Tensor &gelu(Tensor const &t_in, Tensor &t_out);
+
+  /**
+   * @brief     derivative gelu function
+   * @param[in] t_in input tensor
+   * @param[in] t_out output tensor
+   * @param[in] outgoing_derivative outgoing derivative
+   * @param[in] incoming_derivative incoming derivative
+   */
+  static Tensor &geluPrime(Tensor const &t_in, Tensor const &t_out,
+                           Tensor &outgoing_derivative,
+                           Tensor const &incoming_derivative = Tensor());
+
+  /**
    * @brief setActivation by custom activation function
    * @note  apply derivative as this activation_prime_fn does not utilize
    * derivative
diff --git a/nntrainer/layers/common_properties.h b/nntrainer/layers/common_properties.h
index 143dce6..72ab06e 100644
--- a/nntrainer/layers/common_properties.h
+++ b/nntrainer/layers/common_properties.h
@@ -34,6 +34,7 @@ enum class ActivationType {
   ACT_SIGMOID,    /**< sigmoid */
   ACT_RELU,       /**< ReLU */
   ACT_SWISH,      /**< Swish */
+  ACT_GELU,       /**< GELU */
   ACT_SOFTMAX,    /**< softmax */
   ACT_LEAKY_RELU, /**< Leaky ReLU */
   ACT_NONE,       /**< no op */
@@ -840,12 +841,13 @@ public:
 struct ActivationTypeInfo {
   using Enum = nntrainer::ActivationType;
   static constexpr std::initializer_list<Enum> EnumList = {
-    Enum::ACT_TANH, Enum::ACT_SIGMOID, Enum::ACT_RELU, Enum::ACT_SOFTMAX,
-    Enum::ACT_LEAKY_RELU, Enum::ACT_SWISH, Enum::ACT_NONE, Enum::ACT_UNKNOWN};
+    Enum::ACT_TANH,    Enum::ACT_SIGMOID,    Enum::ACT_RELU,
+    Enum::ACT_SOFTMAX, Enum::ACT_LEAKY_RELU, Enum::ACT_SWISH,
+    Enum::ACT_GELU,    Enum::ACT_NONE,       Enum::ACT_UNKNOWN};
 
   static constexpr const char *EnumStr[] = {"tanh",    "sigmoid",    "relu",
                                             "softmax", "leaky_relu", "swish",
-                                            "none",    "unknown"};
+                                            "gelu",    "none",       "unknown"};
 };
 
 /**
diff --git a/test/unittest/layers/unittest_layers_activation.cpp b/test/unittest/layers/unittest_layers_activation.cpp
index a43f8fc..7746aa6 100644
--- a/test/unittest/layers/unittest_layers_activation.cpp
+++ b/test/unittest/layers/unittest_layers_activation.cpp
@@ -26,6 +26,11 @@ auto semantic_activation_swish = LayerSemanticsParamType(
   nntrainer::ActivationLayer::type, {"activation=swish"},
   LayerCreateSetPropertyOptions::AVAILABLE_FROM_APP_CONTEXT, false, 1);
 
+auto semantic_activation_gelu = LayerSemanticsParamType(
+  nntrainer::createLayer<nntrainer::ActivationLayer>,
+  nntrainer::ActivationLayer::type, {"activation=gelu"},
+  LayerCreateSetPropertyOptions::AVAILABLE_FROM_APP_CONTEXT, false, 1);
+
 auto semantic_activation_sigmoid = LayerSemanticsParamType(
   nntrainer::createLayer<nntrainer::ActivationLayer>,
   nntrainer::ActivationLayer::type, {"activation=sigmoid"},
@@ -49,5 +54,6 @@ auto semantic_activation_none = LayerSemanticsParamType(
 GTEST_PARAMETER_TEST(
   Activation, LayerSemantics,
   ::testing::Values(semantic_activation_relu, semantic_activation_swish,
-                    semantic_activation_sigmoid, semantic_activation_softmax,
-                    semantic_activation_tanh, semantic_activation_none));
+                    semantic_activation_gelu, semantic_activation_sigmoid,
+                    semantic_activation_softmax, semantic_activation_tanh,
+                    semantic_activation_none));
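
Note (not part of the patch): with "gelu" registered as a property value, an
application built against the ccapi should be able to request the new
activation either through the generic layer factory or through the new GeLU()
helper. A minimal sketch, assuming the usual ccapi namespaces (ml::train and
ml::train::layer) and the string overload of createLayer; the layer name used
here is illustrative only:

    // Usage sketch, not part of this patch.
    #include <layer.h> // api/ccapi/include/layer.h

    int main() {
      // Via the generic factory and the property string registered in this patch.
      std::unique_ptr<ml::train::Layer> act =
        ml::train::createLayer("activation", {"activation=gelu"});

      // Via the helper added to api/ccapi/include/layer.h.
      std::unique_ptr<ml::train::Layer> gelu_layer =
        ml::train::layer::GeLU({"name=gelu0"});

      return 0;
    }

Either object can then be added to a model like any other activation layer.
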
diff --git a/test/unittest/unittest_nntrainer_activations.cpp b/test/unittest/unittest_nntrainer_activations.cpp
index 25c1bfb..477dd19 100644
--- a/test/unittest/unittest_nntrainer_activations.cpp
+++ b/test/unittest/unittest_nntrainer_activations.cpp
@@ -348,6 +348,63 @@ TEST(nntrainer_activation, swishPrime_01_p) {
   }
 }
 
+TEST(nntrainer_activation, gelu_01_p) {
+  int batch = 3;
+  int channel = 1;
+  int height = 1;
+  int width = 10;
+  float answer[30] = {
+    -0.13783135, -0.11462659, -0.08414805, -0.04601721, 0,
+    0.05398279,  0.11585195,  0.18537343,  0.26216868,  0.34573120,
+    -0.16948429, -0.16455182, -0.13783135, -0.08414805, 0,
+    0.11585195,  0.26216868,  0.43544820,  0.63051575,  0.84134471,
+    -0.13808367, -0.16565408, -0.16455182, -0.11462659, 0,
+    0.18537343,  0.43544820,  0.73434591,  1.06191635,  1.39978909};
+
+  nntrainer::Tensor input(batch, channel, height, width);
+  GEN_TEST_INPUT(input, (l - 4) * 0.1 * (i + 1));
+
+  nntrainer::Tensor results(batch, channel, height, width);
+  results = nntrainer::ActiFunc::gelu(input, results);
+
+  float *data = results.getData();
+  ASSERT_NE(nullptr, data);
+
+  for (int i = 0; i < batch * height * width; ++i) {
+    EXPECT_NEAR(data[i], answer[i], tolerance);
+  }
+}
+
+TEST(nntrainer_activation, geluPrime_01_p) {
+  int batch = 3;
+  int channel = 1;
+  int height = 1;
+  int width = 10;
+
+  float answer[30] = {
+    0.19727029, 0.26767227, 0.34253171,  0.42047682,  0.5,        0.57952315,
+    0.65746832, 0.73232776, 0.80272973,  0.86749506,  -0.01989788, 0.07431830,
+    0.19727029, 0.34253171, 0.5,         0.65746832,  0.80272973, 0.92568171,
+    1.01989794, 1.08331537, -0.11795351, -0.05541664, 0.07431830, 0.26767227,
+    0.5,        0.73232776, 0.92568171,  1.05541658,  1.11795354, 1.12746918};
+
+  nntrainer::Tensor input(batch, channel, height, width);
+  GEN_TEST_INPUT(input, (l - 4) * 0.1 * (i + 1));
+
+  nntrainer::Tensor results(batch, channel, height, width);
+  nntrainer::ActiFunc::gelu(input, results);
+
+  nntrainer::Tensor prime_results(batch, channel, height, width);
+  nntrainer::ActiFunc::geluPrime(input, results, prime_results);
+
+  float *data = prime_results.getData();
+  ASSERT_NE(nullptr, data);
+
+  for (int i = 0; i < batch * height * width; ++i) {
+    EXPECT_NEAR(data[i], answer[i], tolerance);
+  }
+}
+
 /**
  * @brief Main gtest
  */
-- 
2.7.4
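
Note (not part of the patch): per the commit message, the expected values in
the two tests above were generated with PyTorch. They can also be reproduced
from the same erf-based closed form the patch implements. A standalone sketch,
assuming GEN_TEST_INPUT fills element (batch i, width l) with
(l - 4) * 0.1 * (i + 1) as written in the tests; this is not the PyTorch script
the author used:

    #include <cmath>
    #include <cstdio>

    int main() {
      const double inv_sqrt2 = 1.0 / std::sqrt(2.0);
      for (int i = 0; i < 3; ++i) {    // batch index
        for (int l = 0; l < 10; ++l) { // width index
          double x = (l - 4) * 0.1 * (i + 1);
          double cdf = 0.5 * (1.0 + std::erf(x * inv_sqrt2));          // Phi(x)
          double pdf = std::exp(-0.5 * x * x) / std::sqrt(2.0 * M_PI); // phi(x)
          std::printf("x=% .2f  gelu=% .8f  geluPrime=% .8f\n", x, x * cdf,
                      cdf + x * pdf);
        }
      }
      return 0;
    }

Row i of the printout corresponds to the i-th group of ten values in each
answer array; for example, gelu(0.5) = 0.34573 and geluPrime(0.5) = 0.86750
match the last entries of the first group above.
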