Added a naive OpenCL implementation of the SwiGLU layer.
Incorporated kernels for the ops used.
Added unit tests for SwiGLULayerCl.
Signed-off-by: Niket Agarwal <niket.a@samsung.com>
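
A minimal usage sketch (not part of this patch; helper name, namespace, and the
<layer.h> include path are assumptions based on the ccapi changes below):
constructing the new layer with the GPU compute engine so the OpenCL
implementation is selected.

    #include <layer.h>  // ccapi layer interface (assumed include path)
    #include <memory>

    // Hedged sketch: empty property list; Swiglu() and LayerComputeEngine
    // come from the api/ccapi changes in this patch.
    std::unique_ptr<ml::train::Layer> make_swiglu_gpu() {
      return ml::train::layer::Swiglu({}, ml::train::LayerComputeEngine::GPU);
    }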
* @see https://github.com/nnstreamer/nntrainer
* @author Parichay Kapoor <pk.kapoor@samsung.com>
* @author Debadri Samaddar <s.debadri@samsung.com>
+ * @author Niket Agarwal <niket.a@samsung.com>
* @bug No known bugs except for NYI items
* @brief This is layers interface for c++ API
*
* @brief Enumeration of layer type
*/
enum LayerType {
- LAYER_IN = ML_TRAIN_LAYER_TYPE_INPUT, /**< Input Layer type */
- LAYER_FC = ML_TRAIN_LAYER_TYPE_FC, /**< Fully Connected Layer type */
- LAYER_BN = ML_TRAIN_LAYER_TYPE_BN, /**< Batch Normalization Layer type */
+ LAYER_IN = ML_TRAIN_LAYER_TYPE_INPUT, /**< Input Layer type */
+ LAYER_FC = ML_TRAIN_LAYER_TYPE_FC, /**< Fully Connected Layer type */
+ LAYER_SWIGLU = ML_TRAIN_LAYER_TYPE_SWIGLU, /**< Swiglu Layer type */
+ LAYER_BN = ML_TRAIN_LAYER_TYPE_BN, /**< Batch Normalization Layer type */
LAYER_CONV2D = ML_TRAIN_LAYER_TYPE_CONV2D, /**< Convolution 2D Layer type */
LAYER_POOLING2D = ML_TRAIN_LAYER_TYPE_POOLING2D, /**< Pooling 2D Layer type */
LAYER_FLATTEN = ML_TRAIN_LAYER_TYPE_FLATTEN, /**< Flatten Layer type */
return createLayer(LayerType::LAYER_FC, properties, compute_engine);
}
+/**
+ * @brief Helper function to create Swiglu layer
+ */
+inline std::unique_ptr<Layer>
+Swiglu(const std::vector<std::string> &properties = {},
+ const LayerComputeEngine &compute_engine = LayerComputeEngine::CPU) {
+ return createLayer(LayerType::LAYER_SWIGLU, properties, compute_engine);
+}
+
/**
* @brief Helper function to create batch normalization layer
*/
ML_TRAIN_LAYER_TYPE_POSITIONAL_ENCODING =
28, /**< Positional Encoding Layer type (Since 7.0) */
ML_TRAIN_LAYER_TYPE_IDENTITY = 29, /**< Identity Layer type (Since 8.0) */
+ ML_TRAIN_LAYER_TYPE_SWIGLU = 30, /**< Swiglu Layer type */
ML_TRAIN_LAYER_TYPE_PREPROCESS_FLIP =
300, /**< Preprocess flip Layer (Since 6.5) */
ML_TRAIN_LAYER_TYPE_PREPROCESS_TRANSLATE =
* @date 23 Feb 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Debadri Samaddar <s.debadri@samsung.com>
+ * @author Niket Agarwal <niket.a@samsung.com>
* @bug No known bugs except for NYI items
* @brief This file contains app context related functions and classes that
* manages the global configuration of the current OpenCL environment. It also
#include <addition_layer_cl.h>
#include <cl_context.h>
#include <fc_layer_cl.h>
+#include <swiglu_cl.h>
namespace nntrainer {
cc.registerFactory(nntrainer::createLayer<AdditionLayerCL>,
AdditionLayerCL::type,
ml::train::LayerType::LAYER_ADDITION);
+
+ cc.registerFactory(nntrainer::createLayer<SwiGLULayerCl>, SwiGLULayerCl::type,
+ ml::train::LayerType::LAYER_SWIGLU);
}
static void registerer(ClContext &cc) noexcept {
cl_layer_sources = [
'fc_layer_cl.cpp',
'addition_layer_cl.cpp',
+ 'swiglu_cl.cpp',
]
foreach s : cl_layer_sources
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Niket Agarwal <niket.a@samsung.com>
+ *
+ * @file swiglu_cl.cpp
+ * @date 6th June 2024
+ * @brief Implementation of SwiGLU activation function
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Niket Agarwal <niket.a@samsung.com>
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#include "swiglu_cl.h"
+#include <iostream>
+
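+// Both kernels apply SwiGLU element-wise:
+//   out[i] = swish(in1[i]) * in2[i], where swish(x) = x * sigmoid(x)
+//                                                   = x * exp(x) / (1 + exp(x)).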
+std::string swiglu_cl_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+ __kernel void swiglu_cl_fp16(__global const half *in1, __global const half *in2, __global half *out) {
+ int i = get_global_id(0);
+ half swish = in1[i] * exp(in1[i]) / (1 + exp(in1[i]));
+ out[i] = swish * in2[i];
+})";
+
+std::string swiglu_cl_kernel_ =
+ R"(__kernel void swiglu_cl(__global const float *in1, __global const float *in2, __global float *out) {
+ int i = get_global_id(0);
+ float swish = in1[i] * exp(in1[i]) / (1 + exp(in1[i]));
+ out[i] = swish * in2[i];
+})";
+
+namespace nntrainer {
+
+static constexpr size_t OUT_IDX = 0;
+static constexpr size_t INPUT_IDX_1 = 0;
+static constexpr size_t INPUT_IDX_2 = 1;
+
+void SwiGLULayerCl::finalize(nntrainer::InitLayerContext &context) {
+ context.setOutputDimensions({context.getInputDimensions()[0]});
+}
+
+void SwiGLULayerCl::forwarding(RunLayerContext &context, bool training) {
+ Tensor &in1 = context.getInput(INPUT_IDX_1);
+ Tensor &in2 = context.getInput(INPUT_IDX_2);
+ Tensor &out = context.getOutput(OUT_IDX);
+ swigluProcess(in1, in2, out, context);
+}
+
+void SwiGLULayerCl::incremental_forwarding(RunLayerContext &context,
+ unsigned int from, unsigned int to,
+ bool training) {
+ Tensor &in1 = context.getInput(INPUT_IDX_1);
+ Tensor &in2 = context.getInput(INPUT_IDX_2);
+ Tensor &out = context.getOutput(OUT_IDX);
+
+ if (from) {
+ NNTR_THROW_IF(to - from != 1, std::invalid_argument)
+ << "incremental step size is not 1";
+ from = 0;
+ to = 1;
+ }
+
+ swigluProcess(in1, in2, out, context);
+}
+
+opencl::Kernel SwiGLULayerCl::kernel_swiglu;
+opencl::Kernel SwiGLULayerCl::kernel_swiglu_fp16;
+
+void SwiGLULayerCl::swigluProcess(Tensor const &in1, Tensor const &in2,
+ Tensor &result, RunLayerContext &context) {
+
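+ // Treat the 4-D input tensors as a (batch * channel * height) x width
+ // matrix when launching the element-wise kernels.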
+ unsigned int dim1, dim2;
+ dim1 = in1.batch() * in1.channel() * in1.height();
+ dim2 = in1.width();
+
+ if (in1.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *data1 = in1.getData();
+ const float *data2 = in2.getData();
+ float *rdata = result.getData();
+ swiglu_cl(data1, data2, rdata, dim1, dim2, context);
+ } else if (in1.getDataType() == ml::train::TensorDim::DataType::FP16) {
+#ifdef ENABLE_FP16
+ const _FP16 *data1 = in1.getData<_FP16>();
+ const _FP16 *data2 = in2.getData<_FP16>();
+ _FP16 *rdata = result.getData<_FP16>();
+ swiglu_cl_fp16(data1, data2, rdata, dim1, dim2, context);
+#else
+ throw std::invalid_argument("Error: enable-fp16 is not enabled");
+#endif
+ }
+}
+
+void SwiGLULayerCl::swiglu_cl(const float *matAdata, const float *vecXdata,
+ float *vecYdata, unsigned int dim1,
+ unsigned int dim2, RunLayerContext &context) {
+
+ bool result = false;
+
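+ // Each OpenCL step runs inside do { ... } while (false); any failed call
+ // breaks out early instead of nesting error checks.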
+ do {
+ result =
+ context.clCreateKernel(swiglu_cl_kernel_, context.LayerKernel::SWIGLU,
+ SwiGLULayerCl::kernel_swiglu);
+ if (!result) {
+ break;
+ }
+
+ int dim = int(dim1 * dim2);
+ opencl::Buffer inputA(context.context_inst_, sizeof(float) * dim1 * dim2, true,
+ nullptr);
+
+ opencl::Buffer inputX(context.context_inst_, sizeof(float) * dim1 * dim2, true,
+ nullptr);
+
+ opencl::Buffer inOutY(context.context_inst_, sizeof(float) * dim1 * dim2, true,
+ nullptr);
+
+ result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ if (!result) {
+ break;
+ }
+
+ result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ if (!result) {
+ break;
+ }
+
+ result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ if (!result) {
+ break;
+ }
+
+ result = SwiGLULayerCl::kernel_swiglu.SetKernelArguments(0, &inputA,
+ sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = SwiGLULayerCl::kernel_swiglu.SetKernelArguments(1, &inputX,
+ sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = SwiGLULayerCl::kernel_swiglu.SetKernelArguments(2, &inOutY,
+ sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ const int work_groups_count[3] = {dim, 1, 1};
+ const int work_group_size[3] = {32, 32, 1}; // test-value
+
+ result = context.command_queue_inst_.DispatchCommand(
+ SwiGLULayerCl::kernel_swiglu, work_groups_count, work_group_size);
+ if (!result) {
+ break;
+ }
+
+ result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ if (!result) {
+ break;
+ }
+
+ } while (false);
+}
+
+void SwiGLULayerCl::swiglu_cl_fp16(const __fp16 *matAdata,
+ const __fp16 *vecXdata, __fp16 *vecYdata,
+ unsigned int dim1, unsigned int dim2,
+ RunLayerContext &context) {
+
+ bool result = false;
+
+ do {
+ result = context.clCreateKernel(swiglu_cl_kernel_fp16_,
+ context.LayerKernel::SWIGLU_FP16,
+ SwiGLULayerCl::kernel_swiglu_fp16);
+ if (!result) {
+ break;
+ }
+
+ int dim = int(dim1 * dim2);
+ opencl::Buffer inputA(context.context_inst_, sizeof(__fp16) * dim1 * dim2, true,
+ nullptr);
+
+ opencl::Buffer inputX(context.context_inst_, sizeof(__fp16) * dim1 * dim2, true,
+ nullptr);
+
+ opencl::Buffer inOutY(context.context_inst_, sizeof(__fp16) * dim1 * dim2, true,
+ nullptr);
+
+ result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ if (!result) {
+ break;
+ }
+
+ result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ if (!result) {
+ break;
+ }
+
+ result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ if (!result) {
+ break;
+ }
+
+ result = SwiGLULayerCl::kernel_swiglu_fp16.SetKernelArguments(
+ 0, &inputA, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = SwiGLULayerCl::kernel_swiglu_fp16.SetKernelArguments(
+ 1, &inputX, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = SwiGLULayerCl::kernel_swiglu_fp16.SetKernelArguments(
+ 2, &inOutY, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ const int work_groups_count[3] = {dim, 1, 1};
+ const int work_group_size[3] = {32, 32, 1}; // test-value
+
+ result = context.command_queue_inst_.DispatchCommand(
+ SwiGLULayerCl::kernel_swiglu_fp16, work_groups_count, work_group_size);
+ if (!result) {
+ break;
+ }
+
+ result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ if (!result) {
+ break;
+ }
+
+ } while (false);
+}
+
+void SwiGLULayerCl::calcDerivative(nntrainer::RunLayerContext &context) {
+ std::throw_with_nested(std::runtime_error("Training is not supported yet."));
+}
+
+void SwiGLULayerCl::setProperty(const std::vector<std::string> &values) {
+ auto remain_props = loadProperties(values, swiglu_props);
+ if (!remain_props.empty()) {
+ std::string msg = "[SwigluLayerCl] Unknown Layer Properties count " +
+ std::to_string(remain_props.size());
+ throw exception::not_supported(msg);
+ }
+}
+
+#ifdef PLUGGABLE
+
+Layer *create_swiglu_layer_cl() {
+ auto layer = new SwiGLULayerCl();
+ return layer;
+}
+
+void destroy_swiglu_layer_cl(Layer *layer) {
+ delete layer;
+}
+
+extern "C" {
+LayerPluggable ml_train_layer_pluggable{create_swiglu_layer_cl,
+ destroy_swiglu_layer_cl};
+}
+
+#endif
+} // namespace nntrainer
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Niket Agarwal <niket.a@samsung.com>
+ *
+ * @file swiglu_cl.h
+ * @date 6th June 2024
+ * @brief Implementation of SwiGLU activation function
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Niket Agarwal <niket.a@samsung.com>
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#ifndef __SWIGLU_LAYER_CL_H__
+#define __SWIGLU_LAYER_CL_H__
+
+#include <layer_context.h>
+#include <layer_devel.h>
+#include <node_exporter.h>
+
+#include <common_properties.h>
+#include <layer_impl.h>
+#include <opencl_buffer.h>
+#include <opencl_kernel.h>
+#include <utility>
+
+namespace nntrainer {
+
+/**
+ * @brief A SwiGLU layer
+ *
+ */
+class SwiGLULayerCl final : public Layer {
+public:
+ /**
+ * @brief Construct a new SwiGLU layer object
+ *
+ */
+ SwiGLULayerCl() : Layer(), swiglu_props(props::Print()) {}
+
+ /**
+ * @brief Destroy the SwiGLU layer object
+ *
+ */
+ ~SwiGLULayerCl() {}
+
+ /**
+ * @copydoc Layer::finalize(InitLayerContext &context)
+ */
+ void finalize(InitLayerContext &context) override;
+
+ /**
+ * @copydoc Layer::forwarding(RunLayerContext &context, bool training)
+ */
+ void forwarding(RunLayerContext &context, bool training) override;
+
+ /**
+ * @copydoc Layer::incremental_forwarding(RunLayerContext &context, unsigned
+ * int from, unsigned int to, bool training)
+ */
+ void incremental_forwarding(RunLayerContext &context, unsigned int from,
+ unsigned int to, bool training) override;
+
+ /**
+ * @copydoc Layer::calcDerivative(RunLayerContext &context)
+ */
+ void calcDerivative(RunLayerContext &context) override;
+
+ /**
+ * @copydoc bool supportBackwarding() const
+ */
+ bool supportBackwarding() const override { return false; };
+
+ /**
+ * @copydoc Layer::exportTo(Exporter &exporter, ExportMethods method)
+ */
+ void exportTo(Exporter &exporter,
+ const ml::train::ExportMethods &method) const override {};
+
+ /**
+ * @copydoc Layer::getType()
+ */
+ const std::string getType() const override { return SwiGLULayerCl::type; };
+
+ /**
+ * @copydoc Layer::setProperty(const std::vector<std::string> &values)
+ */
+ void setProperty(const std::vector<std::string> &values) override;
+
+ inline static const std::string type = "swiglu";
+
+ static opencl::Kernel kernel_swiglu;
+ static opencl::Kernel kernel_swiglu_fp16;
+
+ std::tuple<props::Print> swiglu_props; /**< swiglu layer properties */
+
+ /**
+ * @brief Process data and dimensions for swiglu operation
+ * @param[in] in1 first input Tensor
+ * @param[in] in2 second input Tensor
+ * @param[out] result output Tensor
+ * @param[in] context RunLayerContext reference
+ */
+ void swigluProcess(Tensor const &in1, Tensor const &in2, Tensor &result,
+ RunLayerContext &context);
+
+ /**
+ * @brief swiglu computation
+ * @param[in] matAdata float * for Input Vector A
+ * @param[in] vecXdata float * for Input Vector X
+ * @param[out] vecYdata float * for Output Vector Y
+ * @param[in] dim1 number of rows in the inputs (batch * channel * height)
+ * @param[in] dim2 number of columns in the inputs (width)
+ * @param[in] context RunLayerContext reference
+ */
+ void swiglu_cl(const float *matAdata, const float *vecXdata, float *vecYdata,
+ unsigned int dim1, unsigned int dim2,
+ RunLayerContext &context);
+
+ /**
+ * @brief fp16 swiglu computation
+ * @param[in] matAdata fp16 * for Input Vector A
+ * @param[in] vecXdata fp16 * for Input Vector X
+ * @param[out] vecYdata fp16 * for Output Vector Y
+ * @param[in] dim1 number of rows in the inputs (batch * channel * height)
+ * @param[in] dim2 number of columns in the inputs (width)
+ * @param[in] context RunLayerContext reference
+ */
+ void swiglu_cl_fp16(const __fp16 *matAdata, const __fp16 *vecXdata,
+ __fp16 *vecYdata, unsigned int dim1, unsigned int dim2,
+ RunLayerContext &context);
+};
+
+} // namespace nntrainer
+
+#endif /* __SWIGLU_LAYER_CL_H__ */
* @see https://github.com/nnstreamer/nntrainer
* @author Parichay Kapoor <pk.kapoor@samsung.com>
* @author Debadri Samaddar <s.debadri@samsung.com>
+ * @author Niket Agarwal <niket.a@samsung.com>
* @bug No known bugs except for NYI items
* @brief This is the layer context for each layer
*/
return "addition_cl";
case LayerKernel::ADD_FP16:
return "addition_cl_fp16";
+ case LayerKernel::SWIGLU:
+ return "swiglu_cl";
+ case LayerKernel::SWIGLU_FP16:
+ return "swiglu_cl_fp16";
default:
return "";
}
* @see https://github.com/nnstreamer/nntrainer
* @author Parichay Kapoor <pk.kapoor@samsung.com>
* @author Debadri Samaddar <s.debadri@samsung.com>
+ * @author Niket Agarwal <niket.a@samsung.com>
* @bug No known bugs except for NYI items
* @brief This is the layer context for each layer
*/
SGEMV_FP16 = 1 << 3, /**< placeholder for kernel name */
DOT_FP16 = 1 << 4, /**< placeholder for kernel name */
SGEMM_FP16 = 1 << 5, /**< placeholder for kernel name */
- ADD = 1 << 6, /**< placeholder for kernel name */
- ADD_FP16 = 1 << 7 /**< placeholder for kernel name */
+ ADD = 1 << 6, /**< placeholder for kernel name */
+ ADD_FP16 = 1 << 7, /**< placeholder for kernel name */
+ SWIGLU = 1 << 8, /**< placeholder for kernel name */
+ SWIGLU_FP16 = 1 << 9 /**< placeholder for kernel name */
};
/**
@author Jihoon Lee <jhoon.it.lee@samsung.com>
@author Sungsik Kong <ss.kong@samsung.com>
@author Debadri Samaddar <s.debadri@samsung.com>
+@author Niket Agarwal <niket.a@samsung.com>
"""
import warnings
added = K.layers.Add()
record_single(added, [(3, 4, 3, 4), (3, 4, 3, 4)], "added_w32a32_2")
+
+ record_single_fp16(
+ swiglu_layer,
+ [(2, 3, 3, 3), (2, 3, 3, 3)],
+ "swiglu",
+ input_type="float",
+ )
../unittest/layers/unittest_layer_node.cpp \
../unittest/layers/unittest_layers.cpp \
../unittest/layers/unittest_layers_impl.cpp \
+ ../unittest/layers/unittest_layers_swiglu_cl.cpp \
+ ../unittest/layers/unittest_layers_fully_connected_cl.cpp \
../unittest/layers/unittest_layers_input.cpp \
../unittest/layers/unittest_layers_loss.cpp \
- ../unittest/layers/unittest_layers_fully_connected_cl.cpp \
../unittest/layers/unittest_layers_fully_connected.cpp \
../unittest/layers/unittest_layers_batch_normalization.cpp \
../unittest/layers/unittest_layers_layer_normalization.cpp \
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Niket Agarwal <niket.a@samsung.com>
+ *
+ * @file unittest_layers_swiglu_cl.cpp
+ * @date 6th June 2024
+ * @brief Swiglu Layer Test
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Niket Agarwal <niket.a@samsung.com>
+ * @bug No known bugs except for NYI items
+ */
+#include <tuple>
+
+#include <gtest/gtest.h>
+
+#include <layers_common_tests.h>
+#include <swiglu_cl.h>
+
+auto semantic_swiglu_gpu = LayerSemanticsParamType(
+ nntrainer::createLayer<nntrainer::SwiGLULayerCl>,
+ nntrainer::SwiGLULayerCl::type, {},
+ LayerCreateSetPropertyOptions::AVAILABLE_FROM_APP_CONTEXT, false, 1);
+
+GTEST_PARAMETER_TEST(SwigluGPU, LayerSemanticsGpu,
+ ::testing::Values(semantic_swiglu_gpu));
+
+auto swiglu_basic_plain =
+ LayerGoldenTestParamType(nntrainer::createLayer<nntrainer::SwiGLULayerCl>, {},
+ "2:3:3:3,2:3:3:3", "swiglu.nnlayergolden",
+ LayerGoldenTestParamOptions::SKIP_CALC_DERIV |
+ LayerGoldenTestParamOptions::SKIP_CALC_GRAD |
+ LayerGoldenTestParamOptions::USE_INC_FORWARD,
+ "nchw", "fp32", "fp32");
+
+GTEST_PARAMETER_TEST(SwigluGPU, LayerGoldenTest,
+ ::testing::Values(swiglu_basic_plain));
+
+#ifdef ENABLE_FP16
+auto swiglu_basic_plain_w16a16 =
+ LayerGoldenTestParamType(nntrainer::createLayer<nntrainer::SwiGLULayerCl>, {},
+ "2:3:3:3,2:3:3:3", "swiglufp16.nnlayergolden",
+ LayerGoldenTestParamOptions::SKIP_CALC_DERIV |
+ LayerGoldenTestParamOptions::SKIP_CALC_GRAD |
+ LayerGoldenTestParamOptions::USE_INC_FORWARD,
+ "nchw", "fp16", "fp16");
+
+GTEST_PARAMETER_TEST(SwigluGPU16, LayerGoldenTest,
+ ::testing::Values(swiglu_basic_plain_w16a16));
+#endif