Tensor &out = context.getOutput(SINGLE_INOUT_IDX);
const Tensor &in1 = context.getInput(INPUT_IDX_1);
const Tensor &in2 = context.getInput(INPUT_IDX_2);
- ConcatProcess(in1, in2, out, context);
+ ConcatProcess(in1, in2, out);
}
void ConcatLayerCl::incremental_forwarding(RunLayerContext &context,
from = 0;
to = 1;
}
- ConcatProcess(in1, in2, out, context);
+ ConcatProcess(in1, in2, out);
}
opencl::Kernel ConcatLayerCl::kernel_concat_axis3;
opencl::Kernel ConcatLayerCl::kernel_concat_axis1_fp16;
void ConcatLayerCl::ConcatProcess(Tensor const &in1, Tensor const &in2,
- Tensor &result, RunLayerContext &context) {
+ Tensor &result) {
unsigned int input1_batch_size, input1_height, input1_width, input1_channels,
input2_batch_size, input2_height, input2_width, input2_channels;
float *rdata = result.getData();
if (input1_width != input2_width) {
concat_cl_axis3(data1, data2, rdata, input1_batch_size, input1_channels,
- input1_height, input1_width, input2_width, context);
+ input1_height, input1_width, input2_width);
} else if (input1_height != input2_height) {
concat_cl_axis2(data1, data2, rdata, input1_batch_size, input1_channels,
- input1_width, input1_height, input2_height, context);
+ input1_width, input1_height, input2_height);
} else if (input1_channels != input2_channels) {
concat_cl_axis1(data1, data2, rdata, input1_batch_size, input1_height,
- input1_width, input1_channels, input2_channels, context);
+ input1_width, input1_channels, input2_channels);
}
} else if (in1.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
if (input1_width != input2_width) {
concat_cl_axis3_fp16(data1, data2, rdata, input1_batch_size,
input1_channels, input1_height, input1_width,
- input2_width, context);
+ input2_width);
} else if (input1_height != input2_height) {
concat_cl_axis2_fp16(data1, data2, rdata, input1_batch_size,
input1_channels, input1_width, input1_height,
- input2_height, context);
+ input2_height);
} else if (input1_channels != input2_channels) {
concat_cl_axis1_fp16(data1, data2, rdata, input1_batch_size,
input1_height, input1_width, input1_channels,
- input2_channels, context);
+ input2_channels);
}
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
}
}
-void ConcatLayerCl::concat_cl_axis3(
- const float *matAdata, const float *vecXdata, float *vecYdata,
- unsigned int input1_batch_size, unsigned int input1_channels,
- unsigned int input1_height, unsigned int input1_width,
- unsigned int input2_width, RunLayerContext &context) {
+void ConcatLayerCl::concat_cl_axis3(const float *matAdata,
+ const float *vecXdata, float *vecYdata,
+ unsigned int input1_batch_size,
+ unsigned int input1_channels,
+ unsigned int input1_height,
+ unsigned int input1_width,
+ unsigned int input2_width) {
bool result = false;
do {
- result = context.clCreateKernel(concat_cl_axis3_kernel_,
- context.LayerKernel::CONCAT_AXIS3,
- ConcatLayerCl::kernel_concat_axis3);
- if (!result) {
+ ClContext::SharedPtrClKernel kernel_concat_ptr =
+ cl_context_ref.registerClKernel(concat_cl_axis3_kernel_,
+ "concat_cl_axis3");
+ if (!kernel_concat_ptr) {
break;
}
int dim = int(input1_batch_size * input1_channels * input1_height *
(input1_width + input2_width));
- opencl::Buffer inputA(context.context_inst_,
+ opencl::Buffer inputA(cl_context_ref.context_inst_,
sizeof(float) * input1_batch_size * input1_channels *
input1_height * input1_width,
true, nullptr);
- opencl::Buffer inputX(context.context_inst_,
+ opencl::Buffer inputX(cl_context_ref.context_inst_,
sizeof(float) * input1_batch_size * input1_channels *
input1_height * input2_width,
true, nullptr);
- opencl::Buffer inOutY(context.context_inst_,
+ opencl::Buffer inOutY(cl_context_ref.context_inst_,
sizeof(float) * input1_batch_size * input1_channels *
input1_height * (input1_width + input2_width),
true, nullptr);
- result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
if (!result) {
break;
}
- result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}
- result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
- 0, &inputA, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
- 1, &inputX, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
- 2, &inOutY, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
- 3, &input1_batch_size, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
- 4, &input1_channels, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
- 5, &input1_height, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
- 6, &input1_width, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(6, &input1_width, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3.SetKernelArguments(
- 7, &input2_width, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(7, &input2_width, sizeof(int));
if (!result) {
break;
}
const int work_groups_count[3] = {dim, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
- result = context.command_queue_inst_.DispatchCommand(
- ConcatLayerCl::kernel_concat_axis3, work_groups_count, work_group_size);
+ result = cl_context_ref.command_queue_inst_.DispatchCommand(
+ kernel_concat_ptr, work_groups_count, work_group_size);
if (!result) {
break;
}
- result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
unsigned int input1_batch_size, unsigned int input1_channels,
unsigned int input1_height, unsigned int input1_width,
- unsigned int input2_width, RunLayerContext &context) {
+ unsigned int input2_width) {
bool result = false;
do {
- result = context.clCreateKernel(concat_cl_axis3_kernel_fp16_,
- context.LayerKernel::CONCAT_AXIS3_FP16,
- ConcatLayerCl::kernel_concat_axis3_fp16);
- if (!result) {
+ ClContext::SharedPtrClKernel kernel_concat_ptr =
+ cl_context_ref.registerClKernel(concat_cl_axis3_kernel_fp16_,
+ "concat_cl_axis3_fp16");
+ if (!kernel_concat_ptr) {
break;
}
int dim = int(input1_batch_size * input1_channels * input1_height *
(input1_width + input2_width));
- opencl::Buffer inputA(context.context_inst_,
+ opencl::Buffer inputA(cl_context_ref.context_inst_,
sizeof(__fp16) * input1_batch_size * input1_channels *
input1_height * input1_width,
true, nullptr);
- opencl::Buffer inputX(context.context_inst_,
+ opencl::Buffer inputX(cl_context_ref.context_inst_,
sizeof(__fp16) * input1_batch_size * input1_channels *
input1_height * input2_width,
true, nullptr);
- opencl::Buffer inOutY(context.context_inst_,
+ opencl::Buffer inOutY(cl_context_ref.context_inst_,
sizeof(__fp16) * input1_batch_size * input1_channels *
input1_height * (input1_width + input2_width),
true, nullptr);
- result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
if (!result) {
break;
}
- result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}
- result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
- 0, &inputA, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
- 1, &inputX, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
- 2, &inOutY, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
- 3, &input1_batch_size, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
- 4, &input1_channels, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
- 5, &input1_height, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
- 6, &input1_width, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(6, &input1_width, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis3_fp16.SetKernelArguments(
- 7, &input2_width, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(7, &input2_width, sizeof(int));
if (!result) {
break;
}
const int work_groups_count[3] = {dim, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
- result = context.command_queue_inst_.DispatchCommand(
- ConcatLayerCl::kernel_concat_axis3_fp16, work_groups_count,
- work_group_size);
+ result = cl_context_ref.command_queue_inst_.DispatchCommand(
+ kernel_concat_ptr, work_groups_count, work_group_size);
if (!result) {
break;
}
- result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
} while (false);
}
-void ConcatLayerCl::concat_cl_axis2(
- const float *matAdata, const float *vecXdata, float *vecYdata,
- unsigned int input1_batch_size, unsigned int input1_channels,
- unsigned int input1_width, unsigned int input1_height,
- unsigned int input2_height, RunLayerContext &context) {
+void ConcatLayerCl::concat_cl_axis2(const float *matAdata,
+ const float *vecXdata, float *vecYdata,
+ unsigned int input1_batch_size,
+ unsigned int input1_channels,
+ unsigned int input1_width,
+ unsigned int input1_height,
+ unsigned int input2_height) {
bool result = false;
do {
- result = context.clCreateKernel(concat_cl_axis2_kernel_,
- context.LayerKernel::CONCAT_AXIS2,
- ConcatLayerCl::kernel_concat_axis2);
- if (!result) {
+ ClContext::SharedPtrClKernel kernel_concat_ptr =
+ cl_context_ref.registerClKernel(concat_cl_axis2_kernel_,
+ "concat_cl_axis2");
+ if (!kernel_concat_ptr) {
break;
}
int dim = int(input1_batch_size * input1_channels * input1_width *
(input1_height + input2_height));
- opencl::Buffer inputA(context.context_inst_,
+ opencl::Buffer inputA(cl_context_ref.context_inst_,
sizeof(float) * input1_batch_size * input1_channels *
input1_height * input1_width,
true, nullptr);
- opencl::Buffer inputX(context.context_inst_,
+ opencl::Buffer inputX(cl_context_ref.context_inst_,
sizeof(float) * input1_batch_size * input1_channels *
input2_height * input1_width,
true, nullptr);
- opencl::Buffer inOutY(context.context_inst_,
+ opencl::Buffer inOutY(cl_context_ref.context_inst_,
sizeof(float) * input1_batch_size * input1_channels *
(input1_height + input2_height) * input1_width,
true, nullptr);
- result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
if (!result) {
break;
}
- result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}
- result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
- 0, &inputA, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
- 1, &inputX, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
- 2, &inOutY, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
- 3, &input1_batch_size, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
- 4, &input1_channels, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
- 5, &input1_height, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
- 6, &input2_height, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2.SetKernelArguments(
- 7, &input1_width, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int));
if (!result) {
break;
}
const int work_groups_count[3] = {dim, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
- result = context.command_queue_inst_.DispatchCommand(
- ConcatLayerCl::kernel_concat_axis2, work_groups_count, work_group_size);
+ result = cl_context_ref.command_queue_inst_.DispatchCommand(
+ kernel_concat_ptr, work_groups_count, work_group_size);
if (!result) {
break;
}
- result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
unsigned int input1_batch_size, unsigned int input1_channels,
unsigned int input1_width, unsigned int input1_height,
- unsigned int input2_height, RunLayerContext &context) {
+ unsigned int input2_height) {
bool result = false;
do {
- result = context.clCreateKernel(concat_cl_axis2_kernel_fp16_,
- context.LayerKernel::CONCAT_AXIS2_FP16,
- ConcatLayerCl::kernel_concat_axis2_fp16);
- if (!result) {
+ ClContext::SharedPtrClKernel kernel_concat_ptr =
+ cl_context_ref.registerClKernel(concat_cl_axis2_kernel_fp16_,
+ "concat_cl_axis2_fp16");
+ if (!kernel_concat_ptr) {
break;
}
int dim = int(input1_batch_size * input1_channels * input1_width *
(input1_height + input2_height));
- opencl::Buffer inputA(context.context_inst_,
+ opencl::Buffer inputA(cl_context_ref.context_inst_,
sizeof(__fp16) * input1_batch_size * input1_channels *
input1_height * input1_width,
true, nullptr);
- opencl::Buffer inputX(context.context_inst_,
+ opencl::Buffer inputX(cl_context_ref.context_inst_,
sizeof(__fp16) * input1_batch_size * input1_channels *
input2_height * input1_width,
true, nullptr);
- opencl::Buffer inOutY(context.context_inst_,
+ opencl::Buffer inOutY(cl_context_ref.context_inst_,
sizeof(__fp16) * input1_batch_size * input1_channels *
(input1_height + input2_height) * input1_width,
true, nullptr);
- result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
if (!result) {
break;
}
- result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}
- result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
- 0, &inputA, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
- 1, &inputX, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
- 2, &inOutY, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
- 3, &input1_batch_size, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
- 4, &input1_channels, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
- 5, &input1_height, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(5, &input1_height, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
- 6, &input2_height, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(6, &input2_height, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis2_fp16.SetKernelArguments(
- 7, &input1_width, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int));
if (!result) {
break;
}
const int work_groups_count[3] = {dim, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
- result = context.command_queue_inst_.DispatchCommand(
- ConcatLayerCl::kernel_concat_axis2_fp16, work_groups_count,
- work_group_size);
+ result = cl_context_ref.command_queue_inst_.DispatchCommand(
+ kernel_concat_ptr, work_groups_count, work_group_size);
if (!result) {
break;
}
- result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
} while (false);
}
-void ConcatLayerCl::concat_cl_axis1(
- const float *matAdata, const float *vecXdata, float *vecYdata,
- unsigned int input1_batch_size, unsigned int input1_height,
- unsigned int input1_width, unsigned int input1_channels,
- unsigned int input2_channels, RunLayerContext &context) {
+void ConcatLayerCl::concat_cl_axis1(const float *matAdata,
+ const float *vecXdata, float *vecYdata,
+ unsigned int input1_batch_size,
+ unsigned int input1_height,
+ unsigned int input1_width,
+ unsigned int input1_channels,
+ unsigned int input2_channels) {
bool result = false;
do {
- result = context.clCreateKernel(concat_cl_axis1_kernel_,
- context.LayerKernel::CONCAT_AXIS1,
- ConcatLayerCl::kernel_concat_axis1);
- if (!result) {
+ ClContext::SharedPtrClKernel kernel_concat_ptr =
+ cl_context_ref.registerClKernel(concat_cl_axis1_kernel_,
+ "concat_cl_axis1");
+ if (!kernel_concat_ptr) {
break;
}
int dim = int(input1_batch_size * input1_width * input1_height *
(input1_channels + input2_channels));
- opencl::Buffer inputA(context.context_inst_,
+ opencl::Buffer inputA(cl_context_ref.context_inst_,
sizeof(float) * input1_batch_size * input1_channels *
input1_height * input1_width,
true, nullptr);
- opencl::Buffer inputX(context.context_inst_,
+ opencl::Buffer inputX(cl_context_ref.context_inst_,
sizeof(float) * input1_batch_size * input2_channels *
input1_height * input1_width,
true, nullptr);
- opencl::Buffer inOutY(context.context_inst_,
+ opencl::Buffer inOutY(cl_context_ref.context_inst_,
sizeof(float) * input1_batch_size * input1_width *
input1_height * (input1_channels + input2_channels),
true, nullptr);
- result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
if (!result) {
break;
}
- result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}
- result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
- 0, &inputA, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
- 1, &inputX, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
- 2, &inOutY, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
- 3, &input1_batch_size, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
- 4, &input1_channels, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
- 5, &input2_channels, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(5, &input2_channels, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
- 6, &input1_height, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(6, &input1_height, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1.SetKernelArguments(
- 7, &input1_width, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int));
if (!result) {
break;
}
const int work_groups_count[3] = {dim, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
- result = context.command_queue_inst_.DispatchCommand(
- ConcatLayerCl::kernel_concat_axis1, work_groups_count, work_group_size);
+ result = cl_context_ref.command_queue_inst_.DispatchCommand(
+ kernel_concat_ptr, work_groups_count, work_group_size);
if (!result) {
break;
}
- result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
unsigned int input1_batch_size, unsigned int input1_height,
unsigned int input1_width, unsigned int input1_channels,
- unsigned int input2_channels, RunLayerContext &context) {
+ unsigned int input2_channels) {
bool result = false;
do {
- result = context.clCreateKernel(concat_cl_axis1_kernel_fp16_,
- context.LayerKernel::CONCAT_AXIS1_FP16,
- ConcatLayerCl::kernel_concat_axis1_fp16);
- if (!result) {
+ ClContext::SharedPtrClKernel kernel_concat_ptr =
+ cl_context_ref.registerClKernel(concat_cl_axis1_kernel_fp16_,
+ "concat_cl_axis1_fp16");
+ if (!kernel_concat_ptr) {
break;
}
int dim = int(input1_batch_size * input1_width * input1_height *
(input1_channels + input2_channels));
- opencl::Buffer inputA(context.context_inst_,
+ opencl::Buffer inputA(cl_context_ref.context_inst_,
sizeof(__fp16) * input1_batch_size * input1_channels *
input1_height * input1_width,
true, nullptr);
- opencl::Buffer inputX(context.context_inst_,
+ opencl::Buffer inputX(cl_context_ref.context_inst_,
sizeof(__fp16) * input1_batch_size * input2_channels *
input1_height * input1_width,
true, nullptr);
- opencl::Buffer inOutY(context.context_inst_,
+ opencl::Buffer inOutY(cl_context_ref.context_inst_,
sizeof(__fp16) * input1_batch_size * input1_width *
input1_height * (input1_channels + input2_channels),
true, nullptr);
- result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
if (!result) {
break;
}
- result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}
- result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
- 0, &inputA, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
- 1, &inputX, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
- 2, &inOutY, sizeof(cl_mem));
+ result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
- 3, &input1_batch_size, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(3, &input1_batch_size, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
- 4, &input1_channels, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(4, &input1_channels, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
- 5, &input2_channels, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(5, &input2_channels, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
- 6, &input1_height, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(6, &input1_height, sizeof(int));
if (!result) {
break;
}
- result = ConcatLayerCl::kernel_concat_axis1_fp16.SetKernelArguments(
- 7, &input1_width, sizeof(int));
+ result =
+ kernel_concat_ptr->SetKernelArguments(7, &input1_width, sizeof(int));
if (!result) {
break;
}
const int work_groups_count[3] = {dim, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
- result = context.command_queue_inst_.DispatchCommand(
- ConcatLayerCl::kernel_concat_axis1_fp16, work_groups_count,
- work_group_size);
+ result = cl_context_ref.command_queue_inst_.DispatchCommand(
+ kernel_concat_ptr, work_groups_count, work_group_size);
if (!result) {
break;
}
- result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
#define __CONCAT_LAYER_CL_H__
#ifdef __cplusplus
+#include <cl_context.h>
#include <common_properties.h>
#include <layer_context.h>
#include <layer_devel.h>
* @brief Concat Layer
*/
class ConcatLayerCl : public Layer {
+
+private:
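+  /**
+   * @brief Static ClContext shared by all ConcatLayerCl instances; used below
+   * to register the concat kernels and to access the command queue and
+   * context instances
+   */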
+ inline static ClContext cl_context_ref;
+
public:
/**
* @brief Constructor of Concat Layer
* @param[in] input1 Tensor
* @param[in] input2 Tensor
* @param[in] result Tensor
- * @param[in] RunLayerContext reference
*/
- void ConcatProcess(Tensor const &in1, Tensor const &in2, Tensor &result,
- RunLayerContext &context);
+ void ConcatProcess(Tensor const &in1, Tensor const &in2, Tensor &result);
/**
* @brief concat computation for axis 3
* @param[in] input1_height represents the height of the input tensor
* @param[in] input1_width represents the width of the input tensor A
* @param[in] input2_width represents the width of the input tensor X
- * @param[in] context RunLayerContext reference
*/
void concat_cl_axis3(const float *matAdata, const float *vecXdata,
float *vecYdata, unsigned int input1_batch_size,
unsigned int input1_channels, unsigned int input1_height,
- unsigned int input1_width, unsigned int input2_width,
- RunLayerContext &context);
+ unsigned int input1_width, unsigned int input2_width);
+
+ /**
+ * @brief concat computation for axis 2
+ * @param[in] matAdata float * for Input Tensor A
+ * @param[in] vecXdata float * for Input Tensor X
+ * @param[in] vecYdata float * for Output Tensor Y
+ * @param[in] input1_batch_size represents the number of samples in the input
+ * tensor
+ * @param[in] input1_channels represents the channels of the input tensor
+ * @param[in] input1_width represents the width of the input tensor
+ * @param[in] input1_height represents the height of the input tensor A
+ * @param[in] input2_height represents the height of the input tensor X
+ */
+ void concat_cl_axis2(const float *matAdata, const float *vecXdata,
+ float *vecYdata, unsigned int input1_batch_size,
+ unsigned int input1_channels, unsigned int input1_width,
+ unsigned int input1_height, unsigned int input2_height);
+
+ /**
+ * @brief concat computation for axis 1
+ * @param[in] matAdata float * for Input Tensor A
+ * @param[in] vecXdata float * for Input Tensor X
+ * @param[in] vecYdata float * for Output Tensor Y
+ * @param[in] input1_batch_size represents the number of samples in the input
+ * tensor
+ * @param[in] input1_height represents the height of the input tensor
+ * @param[in] input1_width represents the width of the input tensor
+ * @param[in] input1_channels represents the channels of the input tensor A
+ * @param[in] input2_channels represents the channels of the input tensor X
+ */
+ void concat_cl_axis1(const float *matAdata, const float *vecXdata,
+ float *vecYdata, unsigned int input1_batch_size,
+ unsigned int input1_height, unsigned int input1_width,
+ unsigned int input1_channels,
+ unsigned int input2_channels);
+#ifdef ENABLE_FP16
/**
* @brief concat computation for axis 3 fp16
* @param[in] matAdata fp16 * for Input Tensor A
* @param[in] input1_height represents the height of the input tensor
* @param[in] input1_width represents the width of the input tensor A
* @param[in] input2_width represents the width of the input tensor X
- * @param[in] context RunLayerContext reference
*/
void concat_cl_axis3_fp16(const __fp16 *matAdata, const __fp16 *vecXdata,
__fp16 *vecYdata, unsigned int input1_batch_size,
unsigned int input1_channels,
unsigned int input1_height,
unsigned int input1_width,
- unsigned int input2_width,
- RunLayerContext &context);
-
- /**
- * @brief concat computation for axis 2
- * @param[in] matAdata float * for Input Tensor A
- * @param[in] vecXdata float * for Input Tensor X
- * @param[in] vecYdata float * for Output Tensor Y
- * @param[in] input1_batch_size represents the number of samples in the input
- * tensor
- * @param[in] input1_channels represents the channels of the input tensor
- * @param[in] input1_width represents the width of the input tensor
- * @param[in] input1_height represents the height of the input tensor A
- * @param[in] input2_height represents the height of the input tensor X
- * @param[in] context RunLayerContext reference
- */
- void concat_cl_axis2(const float *matAdata, const float *vecXdata,
- float *vecYdata, unsigned int input1_batch_size,
- unsigned int input1_channels, unsigned int input1_width,
- unsigned int input1_height, unsigned int input2_height,
- RunLayerContext &context);
+ unsigned int input2_width);
/**
* @brief concat computation for axis 2 fp16
* @param[in] input1_width represents the width of the input tensor
* @param[in] input1_height represents the height of the input tensor A
* @param[in] input2_height represents the height of the input tensor X
- * @param[in] context RunLayerContext reference
*/
void concat_cl_axis2_fp16(const __fp16 *matAdata, const __fp16 *vecXdata,
__fp16 *vecYdata, unsigned int input1_batch_size,
unsigned int input1_channels,
unsigned int input1_width,
unsigned int input1_height,
- unsigned int input2_height,
- RunLayerContext &context);
-
- /**
- * @brief concat computation for axis 1
- * @param[in] matAdata float * for Input Tensor A
- * @param[in] vecXdata float * for Input Tensor X
- * @param[in] vecYdata float * for Output Tensor Y
- * @param[in] input1_batch_size represents the number of samples in the input
- * tensor
- * @param[in] input1_height represents the height of the input tensor
- * @param[in] input1_width represents the width of the input tensor
- * @param[in] input1_channels represents the channels of the input tensor A
- * @param[in] input2_channels represents the channels of the input tensor X
- * @param[in] context RunLayerContext reference
- */
- void concat_cl_axis1(const float *matAdata, const float *vecXdata,
- float *vecYdata, unsigned int input1_batch_size,
- unsigned int input1_height, unsigned int input1_width,
- unsigned int input1_channels,
- unsigned int input2_channels, RunLayerContext &context);
+ unsigned int input2_height);
/**
* @brief concat computation for axis 1 fp16
* @param[in] input1_width represents the width of the input tensor
* @param[in] input1_channels represents the channels of the input tensor A
* @param[in] input2_channels represents the channels of the input tensor X
- * @param[in] context RunLayerContext reference
*/
void concat_cl_axis1_fp16(const __fp16 *matAdata, const __fp16 *vecXdata,
__fp16 *vecYdata, unsigned int input1_batch_size,
unsigned int input1_height,
unsigned int input1_width,
unsigned int input1_channels,
- unsigned int input2_channels,
- RunLayerContext &context);
-
+                          unsigned int input2_channels);
+#endif
private:
std::tuple<props::ConcatDimension> concat_props;
};
if (!context.executeInPlace()) {
Tensor &output = context.getOutput(SINGLE_INOUT_IDX);
const Tensor &input = context.getInput(SINGLE_INOUT_IDX);
- ReshapeProcess(input, output, context);
+ ReshapeProcess(input, output);
}
}
from = 0;
to = 1;
}
- ReshapeProcess(input, output, context);
+ ReshapeProcess(input, output);
}
}
opencl::Kernel ReshapeLayerCl::kernel_copy;
opencl::Kernel ReshapeLayerCl::kernel_copy_fp16;
-void ReshapeLayerCl::ReshapeProcess(Tensor const &input, Tensor &output,
- RunLayerContext &context) {
+void ReshapeLayerCl::ReshapeProcess(Tensor const &input, Tensor &output) {
unsigned int input_batch_size, input_height, input_width, input_channels;
const float *data = input.getData();
float *rdata = output.getData();
copy_cl(data, rdata, input_batch_size, input_channels, input_height,
- input_width, context);
+ input_width);
} else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
const _FP16 *data = input.getData<_FP16>();
_FP16 *rdata = output.getData<_FP16>();
copy_cl_fp16(data, rdata, input_batch_size, input_channels, input_height,
- input_width, context);
+ input_width);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
}
-void ReshapeLayerCl::copy_cl(const float *input, float *res,
- unsigned int input_batch_size,
- unsigned int input_channels,
- unsigned int input_height,
- unsigned int input_width,
- RunLayerContext &context) {
+void ReshapeLayerCl::copy_cl_fp16(const __fp16 *input, __fp16 *res,
+ unsigned int input_batch_size,
+ unsigned int input_channels,
+ unsigned int input_height,
+ unsigned int input_width) {
bool result = false;
do {
- result = context.clCreateKernel(copy_cl_kernel_, context.LayerKernel::COPY,
- ReshapeLayerCl::kernel_copy);
- if (!result) {
+ ClContext::SharedPtrClKernel kernel_copy_ptr =
+ cl_context_ref.registerClKernel(copy_cl_kernel_fp16_, "copy_cl_fp16");
+ if (!kernel_copy_ptr) {
break;
}
- size_t dim_size = sizeof(float) * input_batch_size * input_height *
+ size_t dim_size = sizeof(__fp16) * input_batch_size * input_height *
input_width * input_channels;
- opencl::Buffer inputA(context.context_inst_, dim_size, true, nullptr);
+ opencl::Buffer inputA(cl_context_ref.context_inst_, dim_size, true,
+ nullptr);
- opencl::Buffer inOutRes(context.context_inst_, dim_size, true, nullptr);
+ opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim_size, true,
+ nullptr);
- result = inputA.WriteData(context.command_queue_inst_, input);
+ result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
if (!result) {
break;
}
- result = inOutRes.WriteData(context.command_queue_inst_, res);
+ result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res);
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy.SetKernelArguments(0, &inputA,
- sizeof(cl_mem));
+ result = kernel_copy_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy.SetKernelArguments(1, &inOutRes,
- sizeof(cl_mem));
+ result = kernel_copy_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem));
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy.SetKernelArguments(
- 2, &input_batch_size, sizeof(int));
+ result =
+ kernel_copy_ptr->SetKernelArguments(2, &input_batch_size, sizeof(int));
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy.SetKernelArguments(3, &input_channels,
- sizeof(int));
+ result =
+ kernel_copy_ptr->SetKernelArguments(3, &input_channels, sizeof(int));
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy.SetKernelArguments(4, &input_height,
- sizeof(int));
+ result = kernel_copy_ptr->SetKernelArguments(4, &input_height, sizeof(int));
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy.SetKernelArguments(5, &input_width,
- sizeof(int));
+ result = kernel_copy_ptr->SetKernelArguments(5, &input_width, sizeof(int));
if (!result) {
break;
}
const int work_groups_count[3] = {(int)dim_size, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
- result = context.command_queue_inst_.DispatchCommand(
- ReshapeLayerCl::kernel_copy, work_groups_count, work_group_size);
+ result = cl_context_ref.command_queue_inst_.DispatchCommand(
+ kernel_copy_ptr, work_groups_count, work_group_size);
if (!result) {
break;
}
- result = inOutRes.ReadData(context.command_queue_inst_, res);
+ result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res);
if (!result) {
break;
}
} while (false);
}
-void ReshapeLayerCl::copy_cl_fp16(const __fp16 *input, __fp16 *res,
- unsigned int input_batch_size,
- unsigned int input_channels,
- unsigned int input_height,
- unsigned int input_width,
- RunLayerContext &context) {
+void ReshapeLayerCl::copy_cl(const float *input, float *res,
+ unsigned int input_batch_size,
+ unsigned int input_channels,
+ unsigned int input_height,
+ unsigned int input_width) {
bool result = false;
do {
- result = context.clCreateKernel(copy_cl_kernel_fp16_,
- context.LayerKernel::COPY_FP16,
- ReshapeLayerCl::kernel_copy_fp16);
- if (!result) {
+ ClContext::SharedPtrClKernel kernel_copy_ptr =
+ cl_context_ref.registerClKernel(copy_cl_kernel_, "copy_cl");
+ if (!kernel_copy_ptr) {
break;
}
- size_t dim_size = sizeof(__fp16) * input_batch_size * input_height *
+ size_t dim_size = sizeof(float) * input_batch_size * input_height *
input_width * input_channels;
- opencl::Buffer inputA(context.context_inst_, dim_size, true, nullptr);
+ opencl::Buffer inputA(cl_context_ref.context_inst_, dim_size, true,
+ nullptr);
- opencl::Buffer inOutRes(context.context_inst_, dim_size, true, nullptr);
+ opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim_size, true,
+ nullptr);
- result = inputA.WriteData(context.command_queue_inst_, input);
+ result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
if (!result) {
break;
}
- result = inOutRes.WriteData(context.command_queue_inst_, res);
+ result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res);
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
- 0, &inputA, sizeof(cl_mem));
+ result = kernel_copy_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
- 1, &inOutRes, sizeof(cl_mem));
+ result = kernel_copy_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem));
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
- 2, &input_batch_size, sizeof(int));
+ result =
+ kernel_copy_ptr->SetKernelArguments(2, &input_batch_size, sizeof(int));
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
- 3, &input_channels, sizeof(int));
+ result =
+ kernel_copy_ptr->SetKernelArguments(3, &input_channels, sizeof(int));
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
- 4, &input_height, sizeof(int));
+ result = kernel_copy_ptr->SetKernelArguments(4, &input_height, sizeof(int));
if (!result) {
break;
}
- result = ReshapeLayerCl::kernel_copy_fp16.SetKernelArguments(
- 5, &input_width, sizeof(int));
+ result = kernel_copy_ptr->SetKernelArguments(5, &input_width, sizeof(int));
if (!result) {
break;
}
const int work_groups_count[3] = {(int)dim_size, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
- result = context.command_queue_inst_.DispatchCommand(
- ReshapeLayerCl::kernel_copy_fp16, work_groups_count, work_group_size);
+ result = cl_context_ref.command_queue_inst_.DispatchCommand(
+ kernel_copy_ptr, work_groups_count, work_group_size);
if (!result) {
break;
}
- result = inOutRes.ReadData(context.command_queue_inst_, res);
+ result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res);
if (!result) {
break;
}
Tensor &in1 = context.getInput(INPUT_IDX_1);
Tensor &in2 = context.getInput(INPUT_IDX_2);
Tensor &out = context.getOutput(OUT_IDX);
- swigluProcess(in1, in2, out, context);
+ swigluProcess(in1, in2, out);
}
void SwiGLULayerCl::incremental_forwarding(RunLayerContext &context,
to = 1;
}
- swigluProcess(in1, in2, out, context);
+ swigluProcess(in1, in2, out);
}
opencl::Kernel SwiGLULayerCl::kernel_swiglu;
opencl::Kernel SwiGLULayerCl::kernel_swiglu_fp16;
void SwiGLULayerCl::swigluProcess(Tensor const &in1, Tensor const &in2,
- Tensor &result, RunLayerContext &context) {
+ Tensor &result) {
unsigned int dim1, dim2;
dim1 = in1.batch() * in1.channel() * in1.height();
const float *data1 = in1.getData();
const float *data2 = in2.getData();
float *rdata = result.getData();
- swiglu_cl(data1, data2, rdata, dim1, dim2, context);
+ swiglu_cl(data1, data2, rdata, dim1, dim2);
} else if (in1.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
const _FP16 *data1 = in1.getData<_FP16>();
const _FP16 *data2 = in2.getData<_FP16>();
_FP16 *rdata = result.getData<_FP16>();
- swiglu_cl_fp16(data1, data2, rdata, dim1, dim2, context);
+ swiglu_cl_fp16(data1, data2, rdata, dim1, dim2);
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
void SwiGLULayerCl::swiglu_cl(const float *matAdata, const float *vecXdata,
float *vecYdata, unsigned int dim1,
- unsigned int dim2, RunLayerContext &context) {
+ unsigned int dim2) {
bool result = false;
do {
- result =
- context.clCreateKernel(swiglu_cl_kernel_, context.LayerKernel::SWIGLU,
- SwiGLULayerCl::kernel_swiglu);
- if (!result) {
+ ClContext::SharedPtrClKernel kernel_swiglu_ptr =
+ cl_context_ref.registerClKernel(swiglu_cl_kernel_, "swiglu_cl");
+ if (!kernel_swiglu_ptr) {
break;
}
int dim = int(dim1 * dim2);
- opencl::Buffer inputA(context.context_inst_, sizeof(float) * dim1 * dim2, true,
- nullptr);
+ opencl::Buffer inputA(cl_context_ref.context_inst_,
+ sizeof(float) * dim1 * dim2, true, nullptr);
- opencl::Buffer inputX(context.context_inst_, sizeof(float) * dim1 * dim2, true,
- nullptr);
+ opencl::Buffer inputX(cl_context_ref.context_inst_,
+ sizeof(float) * dim1 * dim2, true, nullptr);
- opencl::Buffer inOutY(context.context_inst_, sizeof(float) * dim1 * dim2, true,
- nullptr);
+ opencl::Buffer inOutY(cl_context_ref.context_inst_,
+ sizeof(float) * dim1 * dim2, true, nullptr);
- result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
if (!result) {
break;
}
- result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}
- result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
- result = SwiGLULayerCl::kernel_swiglu.SetKernelArguments(0, &inputA,
- sizeof(cl_mem));
+ result = kernel_swiglu_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}
- result = SwiGLULayerCl::kernel_swiglu.SetKernelArguments(1, &inputX,
- sizeof(cl_mem));
+ result = kernel_swiglu_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
if (!result) {
break;
}
- result = SwiGLULayerCl::kernel_swiglu.SetKernelArguments(2, &inOutY,
- sizeof(cl_mem));
+ result = kernel_swiglu_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
if (!result) {
break;
}
const int work_groups_count[3] = {dim, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
- result = context.command_queue_inst_.DispatchCommand(
- SwiGLULayerCl::kernel_swiglu, work_groups_count, work_group_size);
+ result = cl_context_ref.command_queue_inst_.DispatchCommand(
+ kernel_swiglu_ptr, work_groups_count, work_group_size);
if (!result) {
break;
}
- result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
void SwiGLULayerCl::swiglu_cl_fp16(const __fp16 *matAdata,
const __fp16 *vecXdata, __fp16 *vecYdata,
- unsigned int dim1, unsigned int dim2,
- RunLayerContext &context) {
+ unsigned int dim1, unsigned int dim2) {
bool result = false;
do {
- result = context.clCreateKernel(swiglu_cl_kernel_fp16_,
- context.LayerKernel::SWIGLU_FP16,
- SwiGLULayerCl::kernel_swiglu_fp16);
- if (!result) {
+ ClContext::SharedPtrClKernel kernel_swiglu_ptr =
+ cl_context_ref.registerClKernel(swiglu_cl_kernel_fp16_, "swiglu_cl_fp16");
+ if (!kernel_swiglu_ptr) {
break;
}
-
+
int dim = int(dim1 * dim2);
- opencl::Buffer inputA(context.context_inst_, sizeof(__fp16) * dim1 * dim2, true,
- nullptr);
+ opencl::Buffer inputA(cl_context_ref.context_inst_,
+ sizeof(__fp16) * dim1 * dim2, true, nullptr);
- opencl::Buffer inputX(context.context_inst_, sizeof(__fp16) * dim1 * dim2, true,
- nullptr);
+ opencl::Buffer inputX(cl_context_ref.context_inst_,
+ sizeof(__fp16) * dim1 * dim2, true, nullptr);
- opencl::Buffer inOutY(context.context_inst_, sizeof(__fp16) * dim1 * dim2, true,
- nullptr);
+ opencl::Buffer inOutY(cl_context_ref.context_inst_,
+ sizeof(__fp16) * dim1 * dim2, true, nullptr);
- result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
if (!result) {
break;
}
- result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
if (!result) {
break;
}
- result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
- result = SwiGLULayerCl::kernel_swiglu_fp16.SetKernelArguments(
- 0, &inputA, sizeof(cl_mem));
+ result = kernel_swiglu_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}
- result = SwiGLULayerCl::kernel_swiglu_fp16.SetKernelArguments(
- 1, &inputX, sizeof(cl_mem));
+ result = kernel_swiglu_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
if (!result) {
break;
}
- result = SwiGLULayerCl::kernel_swiglu_fp16.SetKernelArguments(
- 2, &inOutY, sizeof(cl_mem));
+ result = kernel_swiglu_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
if (!result) {
break;
}
const int work_groups_count[3] = {dim, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
- result = context.command_queue_inst_.DispatchCommand(
- SwiGLULayerCl::kernel_swiglu_fp16, work_groups_count, work_group_size);
+ result = cl_context_ref.command_queue_inst_.DispatchCommand(
+ kernel_swiglu_ptr, work_groups_count, work_group_size);
if (!result) {
break;
}
- result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
if (!result) {
break;
}
return layer;
}
-void destroy_swiglu_layer_cl(Layer *layer) {
- delete layer;
-}
+void destroy_swiglu_layer_cl(Layer *layer) { delete layer; }
extern "C" {
LayerPluggable ml_train_layer_pluggable{create_swiglu_layer_cl,