[GPU/OpenCL] Updated the SwiGLU, Reshape, and Concat layers
author Niket Agarwal <niket.a@samsung.com>
Wed, 9 Oct 2024 07:34:20 +0000 (13:04 +0530)
committer Jijoong Moon <jijoong.moon@samsung.com>
Mon, 14 Oct 2024 11:21:21 +0000 (20:21 +0900)
Updated the SwiGLU, Reshape, and Concat layers to use the new shared_ptr-based kernel flow.
Replaced clCreateKernel with registerClKernel in all of these layers; each layer now obtains
its kernels through a static ClContext reference (cl_context_ref) instead of the
RunLayerContext passed in at forwarding time.
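
For reference, the change follows the pattern sketched below (a simplified illustration
drawn from the hunks in this commit, with buffer setup and most error handling omitted;
work_groups_count and work_group_size stand for the dispatch dimensions each layer
already computes):

    // Before: the layer created its static kernel through the RunLayerContext
    // passed in at forwarding time and dispatched that static object.
    result = context.clCreateKernel(copy_cl_kernel_, context.LayerKernel::COPY,
                                    ReshapeLayerCl::kernel_copy);
    if (!result)
      break;
    result = context.command_queue_inst_.DispatchCommand(
      ReshapeLayerCl::kernel_copy, work_groups_count, work_group_size);

    // After: the kernel is registered once through the layer's static ClContext
    // reference and returned as a shared_ptr, which is dispatched directly, so
    // the per-layer static opencl::Kernel objects are no longer needed.
    ClContext::SharedPtrClKernel kernel_copy_ptr =
      cl_context_ref.registerClKernel(copy_cl_kernel_, "copy_cl");
    if (!kernel_copy_ptr)
      break;
    result = cl_context_ref.command_queue_inst_.DispatchCommand(
      kernel_copy_ptr, work_groups_count, work_group_size);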

Self evaluation:

Build test: [X]Passed [ ]Failed [ ]Skipped
Run test:   [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Niket Agarwal <niket.a@samsung.com>
nntrainer/cl_context.cpp
nntrainer/layers/cl_layers/concat_cl.cpp
nntrainer/layers/cl_layers/concat_cl.h
nntrainer/layers/cl_layers/meson.build
nntrainer/layers/cl_layers/reshape_cl.cpp
nntrainer/layers/cl_layers/reshape_cl.h
nntrainer/layers/cl_layers/swiglu_cl.cpp
nntrainer/layers/cl_layers/swiglu_cl.h

nntrainer/cl_context.cpp
index f32c8301d9381ff1fb0c6bd482725746394c251d..818a77e0d39efee40ae3f0263540d8c6470ec809 100644
@@ -39,21 +39,18 @@ static void add_default_object(ClContext &cc) {
   //                    AdditionLayerCL::type,
   //                    ml::train::LayerType::LAYER_ADDITION);
 
-  // cc.registerFactory(nntrainer::createLayer<SwiGLULayerCl>,
-  // SwiGLULayerCl::type,
-  //                    ml::train::LayerType::LAYER_SWIGLU);
+  cc.registerFactory(nntrainer::createLayer<SwiGLULayerCl>, SwiGLULayerCl::type,
+                     ml::train::LayerType::LAYER_SWIGLU);
 
-  // cc.registerFactory(nntrainer::createLayer<ReshapeLayerCl>,
-  //                    ReshapeLayerCl::type,
-  //                    ml::train::LayerType::LAYER_RESHAPE);
+  cc.registerFactory(nntrainer::createLayer<ReshapeLayerCl>,
+                     ReshapeLayerCl::type, ml::train::LayerType::LAYER_RESHAPE);
 
   // cc.registerFactory(nntrainer::createLayer<RMSNormLayerCl>,
   //                    RMSNormLayerCl::type,
   //                    ml::train::LayerType::LAYER_RMSNORM);
 
-  // cc.registerFactory(nntrainer::createLayer<ConcatLayerCl>,
-  // ConcatLayerCl::type,
-  //                    ml::train::LayerType::LAYER_CONCAT);
+  cc.registerFactory(nntrainer::createLayer<ConcatLayerCl>, ConcatLayerCl::type,
+                     ml::train::LayerType::LAYER_CONCAT);
 }
 
 static void registerer(ClContext &cc) noexcept {
nntrainer/layers/cl_layers/concat_cl.cpp
index 12c2099ea04eb5eb0e07e4a5409c8281e7769ee5..9df692d9c1fa328b4392c87d71957c0d9871cbc5 100644
@@ -284,7 +284,7 @@ void ConcatLayerCl::forwarding(RunLayerContext &context, bool training) {
   Tensor &out = context.getOutput(SINGLE_INOUT_IDX);
   const Tensor &in1 = context.getInput(INPUT_IDX_1);
   const Tensor &in2 = context.getInput(INPUT_IDX_2);
-  ConcatProcess(in1, in2, out, context);
+  ConcatProcess(in1, in2, out);
 }
 
 void ConcatLayerCl::incremental_forwarding(RunLayerContext &context,
@@ -299,7 +299,7 @@ void ConcatLayerCl::incremental_forwarding(RunLayerContext &context,
     from = 0;
     to = 1;
   }
-  ConcatProcess(in1, in2, out, context);
+  ConcatProcess(in1, in2, out);
 }
 
 opencl::Kernel ConcatLayerCl::kernel_concat_axis3;
@@ -310,7 +310,7 @@ opencl::Kernel ConcatLayerCl::kernel_concat_axis1;
 opencl::Kernel ConcatLayerCl::kernel_concat_axis1_fp16;
 
 void ConcatLayerCl::ConcatProcess(Tensor const &in1, Tensor const &in2,
-                                  Tensor &result, RunLayerContext &context) {
+                                  Tensor &result) {
 
   unsigned int input1_batch_size, input1_height, input1_width, input1_channels,
     input2_batch_size, input2_height, input2_width, input2_channels;
@@ -332,13 +332,13 @@ void ConcatLayerCl::ConcatProcess(Tensor const &in1, Tensor const &in2,
     float *rdata = result.getData();
     if (input1_width != input2_width) {
       concat_cl_axis3(data1, data2, rdata, input1_batch_size, input1_channels,
-                      input1_height, input1_width, input2_width, context);
+                      input1_height, input1_width, input2_width);
     } else if (input1_height != input2_height) {
       concat_cl_axis2(data1, data2, rdata, input1_batch_size, input1_channels,
-                      input1_width, input1_height, input2_height, context);
+                      input1_width, input1_height, input2_height);
     } else if (input1_channels != input2_channels) {
       concat_cl_axis1(data1, data2, rdata, input1_batch_size, input1_height,
-                      input1_width, input1_channels, input2_channels, context);
+                      input1_width, input1_channels, input2_channels);
     }
   } else if (in1.getDataType() == ml::train::TensorDim::DataType::FP16) {
 #ifdef ENABLE_FP16
@@ -348,15 +348,15 @@ void ConcatLayerCl::ConcatProcess(Tensor const &in1, Tensor const &in2,
     if (input1_width != input2_width) {
       concat_cl_axis3_fp16(data1, data2, rdata, input1_batch_size,
                            input1_channels, input1_height, input1_width,
-                           input2_width, context);
+                           input2_width);
     } else if (input1_height != input2_height) {
       concat_cl_axis2_fp16(data1, data2, rdata, input1_batch_size,
                            input1_channels, input1_width, input1_height,
-                           input2_height, context);
+                           input2_height);
     } else if (input1_channels != input2_channels) {
       concat_cl_axis1_fp16(data1, data2, rdata, input1_batch_size,
                            input1_height, input1_width, input1_channels,
-                           input2_channels, context);
+                           input2_channels);
     }
 #else
     throw std::invalid_argument("Error: enable-fp16 is not enabled");
@@ -364,99 +364,98 @@ void ConcatLayerCl::ConcatProcess(Tensor const &in1, Tensor const &in2,
   }
 }
 
-void ConcatLayerCl::concat_cl_axis3(
-  const float *matAdata, const float *vecXdata, float *vecYdata,
-  unsigned int input1_batch_size, unsigned int input1_channels,
-  unsigned int input1_height, unsigned int input1_width,
-  unsigned int input2_width, RunLayerContext &context) {
+void ConcatLayerCl::concat_cl_axis3(const float *matAdata,
+                                    const float *vecXdata, float *vecYdata,
+                                    unsigned int input1_batch_size,
+                                    unsigned int input1_channels,
+                                    unsigned int input1_height,
+                                    unsigned int input1_width,
+                                    unsigned int input2_width) {
 
   bool result = false;
 
   do {
-    result = context.clCreateKernel(concat_cl_axis3_kernel_,
-                                    context.LayerKernel::CONCAT_AXIS3,
-                                    ConcatLayerCl::kernel_concat_axis3);
-    if (!result) {
+    ClContext::SharedPtrClKernel kernel_concat_ptr =
+      cl_context_ref.registerClKernel(concat_cl_axis3_kernel_,
+                                      "concat_cl_axis3");
+    if (!kernel_concat_ptr) {
       break;
     }
 
     int dim = int(input1_batch_size * input1_channels * input1_height *
                   (input1_width + input2_width));
 
-    opencl::Buffer inputA(context.context_inst_,
+    opencl::Buffer inputA(cl_context_ref.context_inst_,
                           sizeof(float) * input1_batch_size * input1_channels *
                             input1_height * input1_width,
                           true, nullptr);
 
-    opencl::Buffer inputX(context.context_inst_,
+    opencl::Buffer inputX(cl_context_ref.context_inst_,
                           sizeof(float) * input1_batch_size * input1_channels *
                             input1_height * input2_width,
                           true, nullptr);
 
-    opencl::Buffer inOutY(context.context_inst_,
+    opencl::Buffer inOutY(cl_context_ref.context_inst_,
                           sizeof(float) * input1_batch_size * input1_channels *
                             input1_height * (input1_width + input2_width),
                           true, nullptr);
 
-    result = inputA.WriteData(context.command_queue_inst_, matAdata);
+    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
-      0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
-      1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
-      2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
-      3, &input1_batch_size, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
-      4, &input1_channels, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
-      5, &input1_height, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
-      6, &input1_width, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(6, &input1_width, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
-      7, &input2_width, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(7, &input2_width, sizeof(int));
     if (!result) {
       break;
     }
@@ -464,13 +463,13 @@ void ConcatLayerCl::concat_cl_axis3(
     const int work_groups_count[3] = {dim, 1, 1};
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
-    result = context.command_queue_inst_.DispatchCommand(
-      ConcatLayerCl::kernel_concat_axis3, work_groups_count, work_group_size);
+    result = cl_context_ref.command_queue_inst_.DispatchCommand(
+      kernel_concat_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
 
-    result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
@@ -482,95 +481,92 @@ void ConcatLayerCl::concat_cl_axis3_fp16(
   const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
   unsigned int input1_batch_size, unsigned int input1_channels,
   unsigned int input1_height, unsigned int input1_width,
-  unsigned int input2_width, RunLayerContext &context) {
+  unsigned int input2_width) {
 
   bool result = false;
 
   do {
-    result = context.clCreateKernel(concat_cl_axis3_kernel_fp16_,
-                                    context.LayerKernel::CONCAT_AXIS3_FP16,
-                                    ConcatLayerCl::kernel_concat_axis3_fp16);
-    if (!result) {
+    ClContext::SharedPtrClKernel kernel_concat_ptr =
+      cl_context_ref.registerClKernel(concat_cl_axis3_kernel_fp16_,
+                                      "concat_cl_axis3_fp16");
+    if (!kernel_concat_ptr) {
       break;
     }
 
     int dim = int(input1_batch_size * input1_channels * input1_height *
                   (input1_width + input2_width));
 
-    opencl::Buffer inputA(context.context_inst_,
+    opencl::Buffer inputA(cl_context_ref.context_inst_,
                           sizeof(__fp16) * input1_batch_size * input1_channels *
                             input1_height * input1_width,
                           true, nullptr);
 
-    opencl::Buffer inputX(context.context_inst_,
+    opencl::Buffer inputX(cl_context_ref.context_inst_,
                           sizeof(__fp16) * input1_batch_size * input1_channels *
                             input1_height * input2_width,
                           true, nullptr);
 
-    opencl::Buffer inOutY(context.context_inst_,
+    opencl::Buffer inOutY(cl_context_ref.context_inst_,
                           sizeof(__fp16) * input1_batch_size * input1_channels *
                             input1_height * (input1_width + input2_width),
                           true, nullptr);
 
-    result = inputA.WriteData(context.command_queue_inst_, matAdata);
+    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
-      0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
-      1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
-      2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
-      3, &input1_batch_size, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
-      4, &input1_channels, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
-      5, &input1_height, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
-      6, &input1_width, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(6, &input1_width, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
-      7, &input2_width, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(7, &input2_width, sizeof(int));
     if (!result) {
       break;
     }
@@ -578,14 +574,13 @@ void ConcatLayerCl::concat_cl_axis3_fp16(
     const int work_groups_count[3] = {dim, 1, 1};
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
-    result = context.command_queue_inst_.DispatchCommand(
-      ConcatLayerCl::kernel_concat_axis3_fp16, work_groups_count,
-      work_group_size);
+    result = cl_context_ref.command_queue_inst_.DispatchCommand(
+      kernel_concat_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
 
-    result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
@@ -593,99 +588,98 @@ void ConcatLayerCl::concat_cl_axis3_fp16(
   } while (false);
 }
 
-void ConcatLayerCl::concat_cl_axis2(
-  const float *matAdata, const float *vecXdata, float *vecYdata,
-  unsigned int input1_batch_size, unsigned int input1_channels,
-  unsigned int input1_width, unsigned int input1_height,
-  unsigned int input2_height, RunLayerContext &context) {
+void ConcatLayerCl::concat_cl_axis2(const float *matAdata,
+                                    const float *vecXdata, float *vecYdata,
+                                    unsigned int input1_batch_size,
+                                    unsigned int input1_channels,
+                                    unsigned int input1_width,
+                                    unsigned int input1_height,
+                                    unsigned int input2_height) {
 
   bool result = false;
 
   do {
-    result = context.clCreateKernel(concat_cl_axis2_kernel_,
-                                    context.LayerKernel::CONCAT_AXIS2,
-                                    ConcatLayerCl::kernel_concat_axis2);
-    if (!result) {
+    ClContext::SharedPtrClKernel kernel_concat_ptr =
+      cl_context_ref.registerClKernel(concat_cl_axis2_kernel_,
+                                      "concat_cl_axis2");
+    if (!kernel_concat_ptr) {
       break;
     }
 
     int dim = int(input1_batch_size * input1_channels * input1_width *
                   (input1_height + input2_height));
 
-    opencl::Buffer inputA(context.context_inst_,
+    opencl::Buffer inputA(cl_context_ref.context_inst_,
                           sizeof(float) * input1_batch_size * input1_channels *
                             input1_height * input1_width,
                           true, nullptr);
 
-    opencl::Buffer inputX(context.context_inst_,
+    opencl::Buffer inputX(cl_context_ref.context_inst_,
                           sizeof(float) * input1_batch_size * input1_channels *
                             input2_height * input1_width,
                           true, nullptr);
 
-    opencl::Buffer inOutY(context.context_inst_,
+    opencl::Buffer inOutY(cl_context_ref.context_inst_,
                           sizeof(float) * input1_batch_size * input1_channels *
                             (input1_height + input2_height) * input1_width,
                           true, nullptr);
 
-    result = inputA.WriteData(context.command_queue_inst_, matAdata);
+    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
-      0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
-      1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
-      2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
-      3, &input1_batch_size, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
-      4, &input1_channels, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
-      5, &input1_height, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
-      6, &input2_height, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
-      7, &input1_width, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int));
     if (!result) {
       break;
     }
@@ -693,13 +687,13 @@ void ConcatLayerCl::concat_cl_axis2(
     const int work_groups_count[3] = {dim, 1, 1};
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
-    result = context.command_queue_inst_.DispatchCommand(
-      ConcatLayerCl::kernel_concat_axis2, work_groups_count, work_group_size);
+    result = cl_context_ref.command_queue_inst_.DispatchCommand(
+      kernel_concat_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
 
-    result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
@@ -711,95 +705,92 @@ void ConcatLayerCl::concat_cl_axis2_fp16(
   const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
   unsigned int input1_batch_size, unsigned int input1_channels,
   unsigned int input1_width, unsigned int input1_height,
-  unsigned int input2_height, RunLayerContext &context) {
+  unsigned int input2_height) {
 
   bool result = false;
 
   do {
-    result = context.clCreateKernel(concat_cl_axis2_kernel_fp16_,
-                                    context.LayerKernel::CONCAT_AXIS2_FP16,
-                                    ConcatLayerCl::kernel_concat_axis2_fp16);
-    if (!result) {
+    ClContext::SharedPtrClKernel kernel_concat_ptr =
+      cl_context_ref.registerClKernel(concat_cl_axis2_kernel_fp16_,
+                                      "concat_cl_axis2_fp16");
+    if (!kernel_concat_ptr) {
       break;
     }
 
     int dim = int(input1_batch_size * input1_channels * input1_width *
                   (input1_height + input2_height));
 
-    opencl::Buffer inputA(context.context_inst_,
+    opencl::Buffer inputA(cl_context_ref.context_inst_,
                           sizeof(__fp16) * input1_batch_size * input1_channels *
                             input1_height * input1_width,
                           true, nullptr);
 
-    opencl::Buffer inputX(context.context_inst_,
+    opencl::Buffer inputX(cl_context_ref.context_inst_,
                           sizeof(__fp16) * input1_batch_size * input1_channels *
                             input2_height * input1_width,
                           true, nullptr);
 
-    opencl::Buffer inOutY(context.context_inst_,
+    opencl::Buffer inOutY(cl_context_ref.context_inst_,
                           sizeof(__fp16) * input1_batch_size * input1_channels *
                             (input1_height + input2_height) * input1_width,
                           true, nullptr);
 
-    result = inputA.WriteData(context.command_queue_inst_, matAdata);
+    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
-      0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
-      1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
-      2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
-      3, &input1_batch_size, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
-      4, &input1_channels, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
-      5, &input1_height, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
-      6, &input2_height, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
-      7, &input1_width, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int));
     if (!result) {
       break;
     }
@@ -807,14 +798,13 @@ void ConcatLayerCl::concat_cl_axis2_fp16(
     const int work_groups_count[3] = {dim, 1, 1};
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
-    result = context.command_queue_inst_.DispatchCommand(
-      ConcatLayerCl::kernel_concat_axis2_fp16, work_groups_count,
-      work_group_size);
+    result = cl_context_ref.command_queue_inst_.DispatchCommand(
+      kernel_concat_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
 
-    result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
@@ -822,99 +812,98 @@ void ConcatLayerCl::concat_cl_axis2_fp16(
   } while (false);
 }
 
-void ConcatLayerCl::concat_cl_axis1(
-  const float *matAdata, const float *vecXdata, float *vecYdata,
-  unsigned int input1_batch_size, unsigned int input1_height,
-  unsigned int input1_width, unsigned int input1_channels,
-  unsigned int input2_channels, RunLayerContext &context) {
+void ConcatLayerCl::concat_cl_axis1(const float *matAdata,
+                                    const float *vecXdata, float *vecYdata,
+                                    unsigned int input1_batch_size,
+                                    unsigned int input1_height,
+                                    unsigned int input1_width,
+                                    unsigned int input1_channels,
+                                    unsigned int input2_channels) {
 
   bool result = false;
 
   do {
-    result = context.clCreateKernel(concat_cl_axis1_kernel_,
-                                    context.LayerKernel::CONCAT_AXIS1,
-                                    ConcatLayerCl::kernel_concat_axis1);
-    if (!result) {
+    ClContext::SharedPtrClKernel kernel_concat_ptr =
+      cl_context_ref.registerClKernel(concat_cl_axis1_kernel_,
+                                      "concat_cl_axis1");
+    if (!kernel_concat_ptr) {
       break;
     }
 
     int dim = int(input1_batch_size * input1_width * input1_height *
                   (input1_channels + input2_channels));
 
-    opencl::Buffer inputA(context.context_inst_,
+    opencl::Buffer inputA(cl_context_ref.context_inst_,
                           sizeof(float) * input1_batch_size * input1_channels *
                             input1_height * input1_width,
                           true, nullptr);
 
-    opencl::Buffer inputX(context.context_inst_,
+    opencl::Buffer inputX(cl_context_ref.context_inst_,
                           sizeof(float) * input1_batch_size * input2_channels *
                             input1_height * input1_width,
                           true, nullptr);
 
-    opencl::Buffer inOutY(context.context_inst_,
+    opencl::Buffer inOutY(cl_context_ref.context_inst_,
                           sizeof(float) * input1_batch_size * input1_width *
                             input1_height * (input1_channels + input2_channels),
                           true, nullptr);
 
-    result = inputA.WriteData(context.command_queue_inst_, matAdata);
+    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
-      0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
-      1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
-      2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
-      3, &input1_batch_size, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
-      4, &input1_channels, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
-      5, &input2_channels, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(5, &input2_channels, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
-      6, &input1_height, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(6, &input1_height, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
-      7, &input1_width, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int));
     if (!result) {
       break;
     }
@@ -922,13 +911,13 @@ void ConcatLayerCl::concat_cl_axis1(
     const int work_groups_count[3] = {dim, 1, 1};
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
-    result = context.command_queue_inst_.DispatchCommand(
-      ConcatLayerCl::kernel_concat_axis1, work_groups_count, work_group_size);
+    result = cl_context_ref.command_queue_inst_.DispatchCommand(
+      kernel_concat_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
 
-    result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
@@ -940,95 +929,92 @@ void ConcatLayerCl::concat_cl_axis1_fp16(
   const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
   unsigned int input1_batch_size, unsigned int input1_height,
   unsigned int input1_width, unsigned int input1_channels,
-  unsigned int input2_channels, RunLayerContext &context) {
+  unsigned int input2_channels) {
 
   bool result = false;
 
   do {
-    result = context.clCreateKernel(concat_cl_axis1_kernel_fp16_,
-                                    context.LayerKernel::CONCAT_AXIS1_FP16,
-                                    ConcatLayerCl::kernel_concat_axis1_fp16);
-    if (!result) {
+    ClContext::SharedPtrClKernel kernel_concat_ptr =
+      cl_context_ref.registerClKernel(concat_cl_axis1_kernel_fp16_,
+                                      "concat_cl_axis1_fp16");
+    if (!kernel_concat_ptr) {
       break;
     }
 
     int dim = int(input1_batch_size * input1_width * input1_height *
                   (input1_channels + input2_channels));
 
-    opencl::Buffer inputA(context.context_inst_,
+    opencl::Buffer inputA(cl_context_ref.context_inst_,
                           sizeof(__fp16) * input1_batch_size * input1_channels *
                             input1_height * input1_width,
                           true, nullptr);
 
-    opencl::Buffer inputX(context.context_inst_,
+    opencl::Buffer inputX(cl_context_ref.context_inst_,
                           sizeof(__fp16) * input1_batch_size * input2_channels *
                             input1_height * input1_width,
                           true, nullptr);
 
-    opencl::Buffer inOutY(context.context_inst_,
+    opencl::Buffer inOutY(cl_context_ref.context_inst_,
                           sizeof(__fp16) * input1_batch_size * input1_width *
                             input1_height * (input1_channels + input2_channels),
                           true, nullptr);
 
-    result = inputA.WriteData(context.command_queue_inst_, matAdata);
+    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
-      0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
-      1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
-      2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
-      3, &input1_batch_size, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
-      4, &input1_channels, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
-      5, &input2_channels, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(5, &input2_channels, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
-      6, &input1_height, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(6, &input1_height, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
-      7, &input1_width, sizeof(int));
+    result =
+      kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int));
     if (!result) {
       break;
     }
@@ -1036,14 +1022,13 @@ void ConcatLayerCl::concat_cl_axis1_fp16(
     const int work_groups_count[3] = {dim, 1, 1};
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
-    result = context.command_queue_inst_.DispatchCommand(
-      ConcatLayerCl::kernel_concat_axis1_fp16, work_groups_count,
-      work_group_size);
+    result = cl_context_ref.command_queue_inst_.DispatchCommand(
+      kernel_concat_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
 
-    result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
nntrainer/layers/cl_layers/concat_cl.h
index f0933d3b5ddd396d4350767de51ab36730bcc5df..b51e276930030a8537e35a184f6683570e1fd94c 100644
@@ -15,6 +15,7 @@
 #define __CONCAT_LAYER_CL_H__
 #ifdef __cplusplus
 
+#include <cl_context.h>
 #include <common_properties.h>
 #include <layer_context.h>
 #include <layer_devel.h>
@@ -31,6 +32,10 @@ namespace nntrainer {
  * @brief   Concat Layer
  */
 class ConcatLayerCl : public Layer {
+
+private:
+  inline static ClContext cl_context_ref;
+
 public:
   /**
    * @brief     Constructor of Concat Layer
@@ -113,10 +118,8 @@ public:
    * @param[in] input1 Tensor
    * @param[in] input2 Tensor
    * @param[in] result Tensor
-   * @param[in] RunLayerContext reference
    */
-  void ConcatProcess(Tensor const &in1, Tensor const &in2, Tensor &result,
-                     RunLayerContext &context);
+  void ConcatProcess(Tensor const &in1, Tensor const &in2, Tensor &result);
 
   /**
    * @brief     concat computation for axis 3
@@ -129,14 +132,48 @@ public:
    * @param[in] input1_height   represents the height of the input tensor
    * @param[in] input1_width   represents the width of the input tensor A
    * @param[in] input2_width   represents the width of the input tensor X
-   * @param[in] context RunLayerContext reference
    */
   void concat_cl_axis3(const float *matAdata, const float *vecXdata,
                        float *vecYdata, unsigned int input1_batch_size,
                        unsigned int input1_channels, unsigned int input1_height,
-                       unsigned int input1_width, unsigned int input2_width,
-                       RunLayerContext &context);
+                       unsigned int input1_width, unsigned int input2_width);
+
+  /**
+   * @brief     concat computation for axis 2
+   * @param[in] matAdata float * for Input Tensor A
+   * @param[in] vecXdata float * for Input Tensor X
+   * @param[in] vecYdata float * for Output Tensor Y
+   * @param[in] input1_batch_size  represents the number of samples in the input
+   * tensor
+   * @param[in] input1_channels   represents the channels of the input tensor
+   * @param[in] input1_width   represents the width of the input tensor
+   * @param[in] input1_height   represents the height of the input tensor A
+   * @param[in] input2_height   represents the height of the input tensor X
+   */
+  void concat_cl_axis2(const float *matAdata, const float *vecXdata,
+                       float *vecYdata, unsigned int input1_batch_size,
+                       unsigned int input1_channels, unsigned int input1_width,
+                       unsigned int input1_height, unsigned int input2_height);
+
+  /**
+   * @brief     concat computation for axis 1
+   * @param[in] matAdata float * for Input Tensor A
+   * @param[in] vecXdata float * for Input Tensor X
+   * @param[in] vecYdata float * for Output Tensor Y
+   * @param[in] input1_batch_size  represents the number of samples in the input
+   * tensor
+   * @param[in] input1_height   represents the height of the input tensor
+   * @param[in] input1_width   represents the width of the input tensor
+   * @param[in] input1_channels   represents the channels of the input tensor A
+   * @param[in] input2_channels   represents the channels of the input tensor X
+   */
+  void concat_cl_axis1(const float *matAdata, const float *vecXdata,
+                       float *vecYdata, unsigned int input1_batch_size,
+                       unsigned int input1_height, unsigned int input1_width,
+                       unsigned int input1_channels,
+                       unsigned int input2_channels);
 
+#ifdef ENABLE_FP16
   /**
    * @brief     concat computation for axis 3 fp16
    * @param[in] matAdata fp16 * for Input Tensor A
@@ -148,34 +185,13 @@ public:
    * @param[in] input1_height   represents the height of the input tensor
    * @param[in] input1_width   represents the width of the input tensor A
    * @param[in] input2_width   represents the width of the input tensor X
-   * @param[in] context RunLayerContext reference
    */
   void concat_cl_axis3_fp16(const __fp16 *matAdata, const __fp16 *vecXdata,
                             __fp16 *vecYdata, unsigned int input1_batch_size,
                             unsigned int input1_channels,
                             unsigned int input1_height,
                             unsigned int input1_width,
-                            unsigned int input2_width,
-                            RunLayerContext &context);
-
-  /**
-   * @brief     concat computation for axis 2
-   * @param[in] matAdata float * for Input Tensor A
-   * @param[in] vecXdata float * for Input Tensor X
-   * @param[in] vecYdata float * for Output Tensor Y
-   * @param[in] input1_batch_size  represents the number of samples in the input
-   * tensor
-   * @param[in] input1_channels   represents the channels of the input tensor
-   * @param[in] input1_width   represents the width of the input tensor
-   * @param[in] input1_height   represents the height of the input tensor A
-   * @param[in] input2_height   represents the height of the input tensor X
-   * @param[in] context RunLayerContext reference
-   */
-  void concat_cl_axis2(const float *matAdata, const float *vecXdata,
-                       float *vecYdata, unsigned int input1_batch_size,
-                       unsigned int input1_channels, unsigned int input1_width,
-                       unsigned int input1_height, unsigned int input2_height,
-                       RunLayerContext &context);
+                            unsigned int input2_width);
 
   /**
    * @brief     concat computation for axis 2 fp16
@@ -188,34 +204,13 @@ public:
    * @param[in] input1_width   represents the width of the input tensor
    * @param[in] input1_height   represents the height of the input tensor A
    * @param[in] input2_height   represents the height of the input tensor X
-   * @param[in] context RunLayerContext reference
    */
   void concat_cl_axis2_fp16(const __fp16 *matAdata, const __fp16 *vecXdata,
                             __fp16 *vecYdata, unsigned int input1_batch_size,
                             unsigned int input1_channels,
                             unsigned int input1_width,
                             unsigned int input1_height,
-                            unsigned int input2_height,
-                            RunLayerContext &context);
-
-  /**
-   * @brief     concat computation for axis 1
-   * @param[in] matAdata float * for Input Tensor A
-   * @param[in] vecXdata float * for Input Tensor X
-   * @param[in] vecYdata float * for Output Tensor Y
-   * @param[in] input1_batch_size  represents the number of samples in the input
-   * tensor
-   * @param[in] input1_height   represents the height of the input tensor
-   * @param[in] input1_width   represents the width of the input tensor
-   * @param[in] input1_channels   represents the channels of the input tensor A
-   * @param[in] input2_channels   represents the channels of the input tensor X
-   * @param[in] context RunLayerContext reference
-   */
-  void concat_cl_axis1(const float *matAdata, const float *vecXdata,
-                       float *vecYdata, unsigned int input1_batch_size,
-                       unsigned int input1_height, unsigned int input1_width,
-                       unsigned int input1_channels,
-                       unsigned int input2_channels, RunLayerContext &context);
+                            unsigned int input2_height);
 
   /**
    * @brief     concat computation for axis 1 fp16
@@ -228,16 +223,14 @@ public:
    * @param[in] input1_width   represents the width of the input tensor
    * @param[in] input1_channels   represents the channels of the input tensor A
    * @param[in] input2_channels   represents the channels of the input tensor X
-   * @param[in] context RunLayerContext reference
    */
   void concat_cl_axis1_fp16(const __fp16 *matAdata, const __fp16 *vecXdata,
                             __fp16 *vecYdata, unsigned int input1_batch_size,
                             unsigned int input1_height,
                             unsigned int input1_width,
                             unsigned int input1_channels,
-                            unsigned int input2_channels,
-                            RunLayerContext &context);
-
+                            unsigned int input2_channels);
+#endif
 private:
   std::tuple<props::ConcatDimension> concat_props;
 };
nntrainer/layers/cl_layers/meson.build
index f9d740f00c3f7a3b620369533e6518bdf2cec81d..c75328f69aa44e708c9636bd96136602f1c7c589 100644
@@ -1,10 +1,10 @@
 cl_layer_sources = [
     'fc_layer_cl.cpp',
   # 'addition_layer_cl.cpp',
-  # 'swiglu_cl.cpp',
-  # 'reshape_cl.cpp',
+   'swiglu_cl.cpp',
+   'reshape_cl.cpp',
   # 'rmsnorm_layer_cl.cpp',
-  # 'concat_cl.cpp',
+   'concat_cl.cpp',
 ]
 
 foreach s : cl_layer_sources
nntrainer/layers/cl_layers/reshape_cl.cpp
index 71ae466e7454f3b4d5b0e92ec918c884ca151898..7698966484cfd18694425f049ef912966e658fae 100644
@@ -78,7 +78,7 @@ void ReshapeLayerCl::forwarding(RunLayerContext &context, bool training) {
   if (!context.executeInPlace()) {
     Tensor &output = context.getOutput(SINGLE_INOUT_IDX);
     const Tensor &input = context.getInput(SINGLE_INOUT_IDX);
-    ReshapeProcess(input, output, context);
+    ReshapeProcess(input, output);
   }
 }
 
@@ -94,15 +94,14 @@ void ReshapeLayerCl::incremental_forwarding(RunLayerContext &context,
       from = 0;
       to = 1;
     }
-    ReshapeProcess(input, output, context);
+    ReshapeProcess(input, output);
   }
 }
 
 opencl::Kernel ReshapeLayerCl::kernel_copy;
 opencl::Kernel ReshapeLayerCl::kernel_copy_fp16;
 
-void ReshapeLayerCl::ReshapeProcess(Tensor const &input, Tensor &output,
-                                    RunLayerContext &context) {
+void ReshapeLayerCl::ReshapeProcess(Tensor const &input, Tensor &output) {
 
   unsigned int input_batch_size, input_height, input_width, input_channels;
 
@@ -115,84 +114,81 @@ void ReshapeLayerCl::ReshapeProcess(Tensor const &input, Tensor &output,
     const float *data = input.getData();
     float *rdata = output.getData();
     copy_cl(data, rdata, input_batch_size, input_channels, input_height,
-            input_width, context);
+            input_width);
   } else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
 #ifdef ENABLE_FP16
     const _FP16 *data = input.getData<_FP16>();
     _FP16 *rdata = output.getData<_FP16>();
     copy_cl_fp16(data, rdata, input_batch_size, input_channels, input_height,
-                 input_width, context);
+                 input_width);
 #else
     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 #endif
   }
 }
 
-void ReshapeLayerCl::copy_cl(const float *input, float *res,
-                             unsigned int input_batch_size,
-                             unsigned int input_channels,
-                             unsigned int input_height,
-                             unsigned int input_width,
-                             RunLayerContext &context) {
+void ReshapeLayerCl::copy_cl_fp16(const __fp16 *input, __fp16 *res,
+                                  unsigned int input_batch_size,
+                                  unsigned int input_channels,
+                                  unsigned int input_height,
+                                  unsigned int input_width) {
 
   bool result = false;
 
   do {
-    result = context.clCreateKernel(copy_cl_kernel_, context.LayerKernel::COPY,
-                                    ReshapeLayerCl::kernel_copy);
-    if (!result) {
+    ClContext::SharedPtrClKernel kernel_copy_ptr =
+      cl_context_ref.registerClKernel(copy_cl_kernel_fp16_, "copy_cl_fp16");
+    if (!kernel_copy_ptr) {
       break;
     }
 
-    size_t dim_size = sizeof(float) * input_batch_size * input_height *
+    size_t dim_size = sizeof(__fp16) * input_batch_size * input_height *
                       input_width * input_channels;
 
-    opencl::Buffer inputA(context.context_inst_, dim_size, true, nullptr);
+    opencl::Buffer inputA(cl_context_ref.context_inst_, dim_size, true,
+                          nullptr);
 
-    opencl::Buffer inOutRes(context.context_inst_, dim_size, true, nullptr);
+    opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim_size, true,
+                            nullptr);
 
-    result = inputA.WriteData(context.command_queue_inst_, input);
+    result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
     if (!result) {
       break;
     }
 
-    result = inOutRes.WriteData(context.command_queue_inst_, res);
+    result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res);
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy.SetKernelArguments(0, &inputA,
-                                                            sizeof(cl_mem));
+    result = kernel_copy_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy.SetKernelArguments(1, &inOutRes,
-                                                            sizeof(cl_mem));
+    result = kernel_copy_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy.SetKernelArguments(
-      2, &input_batch_size, sizeof(int));
+    result =
+      kernel_copy_ptr->SetKernelArguments(2, &input_batch_size, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy.SetKernelArguments(3, &input_channels,
-                                                            sizeof(int));
+    result =
+      kernel_copy_ptr->SetKernelArguments(3, &input_channels, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy.SetKernelArguments(4, &input_height,
-                                                            sizeof(int));
+    result = kernel_copy_ptr->SetKernelArguments(4, &input_height, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy.SetKernelArguments(5, &input_width,
-                                                            sizeof(int));
+    result = kernel_copy_ptr->SetKernelArguments(5, &input_width, sizeof(int));
     if (!result) {
       break;
     }
@@ -200,13 +196,13 @@ void ReshapeLayerCl::copy_cl(const float *input, float *res,
     const int work_groups_count[3] = {(int)dim_size, 1, 1};
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
-    result = context.command_queue_inst_.DispatchCommand(
-      ReshapeLayerCl::kernel_copy, work_groups_count, work_group_size);
+    result = cl_context_ref.command_queue_inst_.DispatchCommand(
+      kernel_copy_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
 
-    result = inOutRes.ReadData(context.command_queue_inst_, res);
+    result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res);
     if (!result) {
       break;
     }
@@ -214,72 +210,68 @@ void ReshapeLayerCl::copy_cl(const float *input, float *res,
   } while (false);
 }
 
-void ReshapeLayerCl::copy_cl_fp16(const __fp16 *input, __fp16 *res,
-                                  unsigned int input_batch_size,
-                                  unsigned int input_channels,
-                                  unsigned int input_height,
-                                  unsigned int input_width,
-                                  RunLayerContext &context) {
+void ReshapeLayerCl::copy_cl(const float *input, float *res,
+                             unsigned int input_batch_size,
+                             unsigned int input_channels,
+                             unsigned int input_height,
+                             unsigned int input_width) {
 
   bool result = false;
 
   do {
-    result = context.clCreateKernel(copy_cl_kernel_fp16_,
-                                    context.LayerKernel::COPY_FP16,
-                                    ReshapeLayerCl::kernel_copy_fp16);
-    if (!result) {
+    ClContext::SharedPtrClKernel kernel_copy_ptr =
+      cl_context_ref.registerClKernel(copy_cl_kernel_, "copy_cl");
+    if (!kernel_copy_ptr) {
       break;
     }
 
-    size_t dim_size = sizeof(__fp16) * input_batch_size * input_height *
+    size_t dim_size = sizeof(float) * input_batch_size * input_height *
                       input_width * input_channels;
 
-    opencl::Buffer inputA(context.context_inst_, dim_size, true, nullptr);
+    opencl::Buffer inputA(cl_context_ref.context_inst_, dim_size, true,
+                          nullptr);
 
-    opencl::Buffer inOutRes(context.context_inst_, dim_size, true, nullptr);
+    opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim_size, true,
+                            nullptr);
 
-    result = inputA.WriteData(context.command_queue_inst_, input);
+    result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
     if (!result) {
       break;
     }
 
-    result = inOutRes.WriteData(context.command_queue_inst_, res);
+    result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res);
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
-      0, &inputA, sizeof(cl_mem));
+    result = kernel_copy_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
-      1, &inOutRes, sizeof(cl_mem));
+    result = kernel_copy_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
-      2, &input_batch_size, sizeof(int));
+    result =
+      kernel_copy_ptr->SetKernelArguments(2, &input_batch_size, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
-      3, &input_channels, sizeof(int));
+    result =
+      kernel_copy_ptr->SetKernelArguments(3, &input_channels, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
-      4, &input_height, sizeof(int));
+    result = kernel_copy_ptr->SetKernelArguments(4, &input_height, sizeof(int));
     if (!result) {
       break;
     }
 
-    result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
-      5, &input_width, sizeof(int));
+    result = kernel_copy_ptr->SetKernelArguments(5, &input_width, sizeof(int));
     if (!result) {
       break;
     }
@@ -287,13 +279,13 @@ void ReshapeLayerCl::copy_cl_fp16(const __fp16 *input, __fp16 *res,
     const int work_groups_count[3] = {(int)dim_size, 1, 1};
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
-    result = context.command_queue_inst_.DispatchCommand(
-      ReshapeLayerCl::kernel_copy_fp16, work_groups_count, work_group_size);
+    result = cl_context_ref.command_queue_inst_.DispatchCommand(
+      kernel_copy_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
 
-    result = inOutRes.ReadData(context.command_queue_inst_, res);
+    result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res);
     if (!result) {
       break;
     }
nntrainer/layers/cl_layers/reshape_cl.h
index 7715d8785d18a6ccd76683c950db38ac06b6da5e..3d19a0e0b609a8ba5820c21cf7d5cd34a4bffce2 100644
@@ -15,6 +15,7 @@
 #define __RESHAPE_LAYER_CL_H__
 #ifdef __cplusplus
 
+#include <cl_context.h>
 #include <common_properties.h>
 #include <layer_devel.h>
 #include <opencl_buffer.h>
@@ -26,6 +27,10 @@ namespace nntrainer {
  * @brief   Reshape Layer
  */
 class ReshapeLayerCl : public Layer {
+
+private:
+  inline static ClContext cl_context_ref;
+
 public:
   /**
    * @brief     Constructor of Reshape Layer
@@ -107,10 +112,8 @@ public:
    * @brief Process data and dimensions for reshape operation
    * @param[in] input Tensor
    * @param[in] result Tensor
-   * @param[in] context RunLayerContext reference
    */
-  void ReshapeProcess(Tensor const &input, Tensor &result,
-                      RunLayerContext &context);
+  void ReshapeProcess(Tensor const &input, Tensor &result);
 
   /**
    * @brief     copy computation
@@ -121,12 +124,12 @@ public:
    * @param[in] input_channels   represents the channels of the input tensor
    * @param[in] input_height   represents the height of the input tensor
    * @param[in] input_width   represents the width of the input tensor
-   * @param[in] context RunLayerContext reference
    */
   void copy_cl(const float *input, float *res, unsigned int input_batch_size,
                unsigned int input_channels, unsigned int input_height,
-               unsigned int input_width, RunLayerContext &context);
+               unsigned int input_width);
 
+#ifdef ENABLE_FP16
   /**
    * @brief     copy computation
    * @param[in] input fp16 * for Input Tensor
@@ -136,12 +139,11 @@ public:
    * @param[in] input_channels   represents the channels of the input tensor
    * @param[in] input_height   represents the height of the input tensor
    * @param[in] input_width   represents the width of the input tensor
-   * @param[in] context RunLayerContext reference
    */
   void copy_cl_fp16(const __fp16 *input, __fp16 *res,
                     unsigned int input_batch_size, unsigned int input_channels,
-                    unsigned int input_height, unsigned int input_width,
-                    RunLayerContext &context);
+                    unsigned int input_height, unsigned int input_width);
+#endif
 
 protected:
   std::tuple<props::TargetShape>
index ed4e65bb5e216524c74a5dc4e5b359387bdeb5c3..9545237a79210becab84cc09277973c06f8bbe3f 100644 (file)
@@ -43,7 +43,7 @@ void SwiGLULayerCl::forwarding(RunLayerContext &context, bool training) {
   Tensor &in1 = context.getInput(INPUT_IDX_1);
   Tensor &in2 = context.getInput(INPUT_IDX_2);
   Tensor &out = context.getOutput(OUT_IDX);
-  swigluProcess(in1, in2, out, context);
+  swigluProcess(in1, in2, out);
 }
 
 void SwiGLULayerCl::incremental_forwarding(RunLayerContext &context,
@@ -60,14 +60,14 @@ void SwiGLULayerCl::incremental_forwarding(RunLayerContext &context,
     to = 1;
   }
 
-  swigluProcess(in1, in2, out, context);
+  swigluProcess(in1, in2, out);
 }
 
 opencl::Kernel SwiGLULayerCl::kernel_swiglu;
 opencl::Kernel SwiGLULayerCl::kernel_swiglu_fp16;
 
 void SwiGLULayerCl::swigluProcess(Tensor const &in1, Tensor const &in2,
-                                  Tensor &result, RunLayerContext &context) {
+                                  Tensor &result) {
 
   unsigned int dim1, dim2;
   dim1 = in1.batch() * in1.channel() * in1.height();
@@ -77,13 +77,13 @@ void SwiGLULayerCl::swigluProcess(Tensor const &in1, Tensor const &in2,
     const float *data1 = in1.getData();
     const float *data2 = in2.getData();
     float *rdata = result.getData();
-    swiglu_cl(data1, data2, rdata, dim1, dim2, context);
+    swiglu_cl(data1, data2, rdata, dim1, dim2);
   } else if (in1.getDataType() == ml::train::TensorDim::DataType::FP16) {
 #ifdef ENABLE_FP16
     const _FP16 *data1 = in1.getData<_FP16>();
     const _FP16 *data2 = in2.getData<_FP16>();
     _FP16 *rdata = result.getData<_FP16>();
-    swiglu_cl_fp16(data1, data2, rdata, dim1, dim2, context);
+    swiglu_cl_fp16(data1, data2, rdata, dim1, dim2);
 #else
     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 #endif
@@ -92,57 +92,53 @@ void SwiGLULayerCl::swigluProcess(Tensor const &in1, Tensor const &in2,
 
 void SwiGLULayerCl::swiglu_cl(const float *matAdata, const float *vecXdata,
                               float *vecYdata, unsigned int dim1,
-                              unsigned int dim2, RunLayerContext &context) {
+                              unsigned int dim2) {
 
   bool result = false;
 
   do {
-    result =
-      context.clCreateKernel(swiglu_cl_kernel_, context.LayerKernel::SWIGLU,
-                             SwiGLULayerCl::kernel_swiglu);
-    if (!result) {
+    ClContext::SharedPtrClKernel kernel_swiglu_ptr =
+      cl_context_ref.registerClKernel(swiglu_cl_kernel_, "swiglu_cl");
+    if (!kernel_swiglu_ptr) {
       break;
     }
 
     int dim = int(dim1 * dim2);
-    opencl::Buffer inputA(context.context_inst_, sizeof(float) * dim1 * dim2, true,
-                          nullptr);
+    opencl::Buffer inputA(cl_context_ref.context_inst_,
+                          sizeof(float) * dim1 * dim2, true, nullptr);
 
-    opencl::Buffer inputX(context.context_inst_, sizeof(float) * dim1 * dim2, true,
-                          nullptr);
+    opencl::Buffer inputX(cl_context_ref.context_inst_,
+                          sizeof(float) * dim1 * dim2, true, nullptr);
 
-    opencl::Buffer inOutY(context.context_inst_, sizeof(float) * dim1 * dim2, true,
-                          nullptr);
+    opencl::Buffer inOutY(cl_context_ref.context_inst_,
+                          sizeof(float) * dim1 * dim2, true, nullptr);
 
-    result = inputA.WriteData(context.command_queue_inst_, matAdata);
+    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
 
-    result = SwiGLULayerCl::kernel_swiglu.SetKernelArguments(0, &inputA,
-                                                             sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = SwiGLULayerCl::kernel_swiglu.SetKernelArguments(1, &inputX,
-                                                             sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = SwiGLULayerCl::kernel_swiglu.SetKernelArguments(2, &inOutY,
-                                                             sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
     if (!result) {
       break;
     }
@@ -150,13 +146,13 @@ void SwiGLULayerCl::swiglu_cl(const float *matAdata, const float *vecXdata,
     const int work_groups_count[3] = {dim, 1, 1};
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
-    result = context.command_queue_inst_.DispatchCommand(
-      SwiGLULayerCl::kernel_swiglu, work_groups_count, work_group_size);
+    result = cl_context_ref.command_queue_inst_.DispatchCommand(
+      kernel_swiglu_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
 
-    result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
@@ -166,58 +162,53 @@ void SwiGLULayerCl::swiglu_cl(const float *matAdata, const float *vecXdata,
 
 void SwiGLULayerCl::swiglu_cl_fp16(const __fp16 *matAdata,
                                    const __fp16 *vecXdata, __fp16 *vecYdata,
-                                   unsigned int dim1, unsigned int dim2,
-                                   RunLayerContext &context) {
+                                   unsigned int dim1, unsigned int dim2) {
 
   bool result = false;
 
   do {
-    result = context.clCreateKernel(swiglu_cl_kernel_fp16_,
-                                    context.LayerKernel::SWIGLU_FP16,
-                                    SwiGLULayerCl::kernel_swiglu_fp16);
-    if (!result) {
+    ClContext::SharedPtrClKernel kernel_swiglu_ptr =
+      cl_context_ref.registerClKernel(swiglu_cl_kernel_fp16_, "swiglu_cl_fp16");
+    if (!kernel_swiglu_ptr) {
       break;
     }
-    
+
     int dim = int(dim1 * dim2);
-    opencl::Buffer inputA(context.context_inst_, sizeof(__fp16) * dim1 * dim2, true,
-                          nullptr);
+    opencl::Buffer inputA(cl_context_ref.context_inst_,
+                          sizeof(__fp16) * dim1 * dim2, true, nullptr);
 
-    opencl::Buffer inputX(context.context_inst_, sizeof(__fp16) * dim1 * dim2, true,
-                          nullptr);
+    opencl::Buffer inputX(cl_context_ref.context_inst_,
+                          sizeof(__fp16) * dim1 * dim2, true, nullptr);
 
-    opencl::Buffer inOutY(context.context_inst_, sizeof(__fp16) * dim1 * dim2, true,
-                          nullptr);
+    opencl::Buffer inOutY(cl_context_ref.context_inst_,
+                          sizeof(__fp16) * dim1 * dim2, true, nullptr);
 
-    result = inputA.WriteData(context.command_queue_inst_, matAdata);
+    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
 
-    result = SwiGLULayerCl::kernel_swiglu_fp16.SetKernelArguments(
-      0, &inputA, sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = SwiGLULayerCl::kernel_swiglu_fp16.SetKernelArguments(
-      1, &inputX, sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = SwiGLULayerCl::kernel_swiglu_fp16.SetKernelArguments(
-      2, &inOutY, sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
     if (!result) {
       break;
     }
@@ -225,13 +216,13 @@ void SwiGLULayerCl::swiglu_cl_fp16(const __fp16 *matAdata,
     const int work_groups_count[3] = {dim, 1, 1};
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
-    result = context.command_queue_inst_.DispatchCommand(
-      SwiGLULayerCl::kernel_swiglu_fp16, work_groups_count, work_group_size);
+    result = cl_context_ref.command_queue_inst_.DispatchCommand(
+      kernel_swiglu_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
 
-    result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
     if (!result) {
       break;
     }
@@ -259,9 +250,7 @@ Layer *create_swiglu_layer_cl() {
   return layer;
 }
 
-void destroy_swiglu_layer_cl(Layer *layer) {
-  delete layer;
-}
+void destroy_swiglu_layer_cl(Layer *layer) { delete layer; }
 
 extern "C" {
 LayerPluggable ml_train_layer_pluggable{create_swiglu_layer_cl,
index a96211ee3ee40e808210d286d6214066c0a6d610..bbb74dc77a431068b40787511061a1e06b619c60 100644 (file)
 #ifndef __SWIGLU_LAYER_CL_H__
 #define __SWIGLU_LAYER_CL_H__
 
+#include <cl_context.h>
+#include <common_properties.h>
 #include <layer_context.h>
 #include <layer_devel.h>
-#include <node_exporter.h>
-
-#include <common_properties.h>
 #include <layer_impl.h>
+#include <node_exporter.h>
 #include <opencl_buffer.h>
 #include <opencl_kernel.h>
 #include <utility>
@@ -31,6 +31,10 @@ namespace nntrainer {
  *
  */
 class SwiGLULayerCl final : public Layer {
+
+private:
+  inline static ClContext cl_context_ref;
+
 public:
   /**
    * @brief Construct a new SwiGLU layer object
@@ -100,10 +104,8 @@ public:
    * @param[in] in1 Tensor
    * @param[in] in2 Tensor
    * @param[in] result Tensor
-   * @param[in] RunLayerContext reference
    */
-  void swigluProcess(Tensor const &in1, Tensor const &in2, Tensor &result,
-                     RunLayerContext &context);
+  void swigluProcess(Tensor const &in1, Tensor const &in2, Tensor &result);
 
   /**
    * @brief     swiglu computation
@@ -112,12 +114,11 @@ public:
    * @param[in] vecYdata float * for Output Vector Y
    * @param[in] dim1 number of elements in input vector A
    * @param[in] dim2 number of elements in input vector X
-   * @param[in] context RunLayerContext reference
    */
   void swiglu_cl(const float *matAdata, const float *vecXdata, float *vecYdata,
-                 unsigned int dim1, unsigned int dim2,
-                 RunLayerContext &context);
+                 unsigned int dim1, unsigned int dim2);
 
+#ifdef ENABLE_FP16
   /**
    * @brief     fp16 swiglu computation
    * @param[in] matAdata fp16 * for Input Vector A
@@ -125,11 +126,10 @@ public:
    * @param[in] vecYdata fp16 * for Output Vector Y
    * @param[in] dim1 number of elements in input vector A
    * @param[in] dim2 number of elements in input vector X
-   * @param[in] context RunLayerContext reference
    */
   void swiglu_cl_fp16(const __fp16 *matAdata, const __fp16 *vecXdata,
-                      __fp16 *vecYdata, unsigned int dim1, unsigned int dim2,
-                      RunLayerContext &context);
+                      __fp16 *vecYdata, unsigned int dim1, unsigned int dim2);
+#endif
 };
 
 } // namespace nntrainer
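
For reference only (not part of the patch), a minimal sketch of the shared_ptr kernel flow that replaces the old clCreateKernel path in the layers above. The kernel name "my_kernel", the source string my_kernel_source_, and the wrapper run_my_kernel() are hypothetical; the ClContext, opencl::Buffer, SetKernelArguments and DispatchCommand calls mirror the ones used in this commit.

#include <cl_context.h>
#include <opencl_buffer.h>
#include <string>

using namespace nntrainer;

// Hypothetical OpenCL C source; the real layers keep theirs in *_cl.cpp.
static const std::string my_kernel_source_ =
  "__kernel void my_kernel(__global const float *in, __global float *out, int n) {"
  "  int i = get_global_id(0); if (i < n) out[i] = in[i]; }";

static ClContext cl_context_ref; // the layers hold this as an inline static member

static void run_my_kernel(const float *in, float *out, unsigned int n) {
  bool result = false;
  do {
    // registerClKernel caches and returns a shared_ptr kernel handle,
    // replacing the per-layer static opencl::Kernel + clCreateKernel pair.
    ClContext::SharedPtrClKernel kernel_ptr =
      cl_context_ref.registerClKernel(my_kernel_source_, "my_kernel");
    if (!kernel_ptr) {
      break;
    }

    size_t size = sizeof(float) * n;
    opencl::Buffer inBuf(cl_context_ref.context_inst_, size, true, nullptr);
    opencl::Buffer outBuf(cl_context_ref.context_inst_, size, true, nullptr);

    result = inBuf.WriteData(cl_context_ref.command_queue_inst_, in);
    if (!result) {
      break;
    }

    // Arguments are set on the shared kernel handle instead of the static kernel.
    result = kernel_ptr->SetKernelArguments(0, &inBuf, sizeof(cl_mem));
    if (!result) {
      break;
    }
    result = kernel_ptr->SetKernelArguments(1, &outBuf, sizeof(cl_mem));
    if (!result) {
      break;
    }
    result = kernel_ptr->SetKernelArguments(2, &n, sizeof(int));
    if (!result) {
      break;
    }

    const int work_groups_count[3] = {(int)n, 1, 1};
    const int work_group_size[3] = {32, 32, 1}; // test-value, as in the layers

    result = cl_context_ref.command_queue_inst_.DispatchCommand(
      kernel_ptr, work_groups_count, work_group_size);
    if (!result) {
      break;
    }

    result = outBuf.ReadData(cl_context_ref.command_queue_inst_, out);
  } while (false);
}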