Added blas_kernels_fp16.cpp implementing the fp16 OpenCL BLAS kernels (sgemv, dot, sgemm).
Added fp16 unit tests for the GPU fully connected layer.
Signed-off-by: Debadri Samaddar <s.debadri@samsung.com>
std::string sgemv_cl_kernel_ =
R"(__kernel void sgemv_cl(const __global float* A, const __global float* X,
- __global float* Y, unsigned int M, unsigned int N) {
+ __global float* Y, unsigned int M, unsigned int lda) {
unsigned int i;
i = get_global_id(0);
float y0 = 0.0f;
for (unsigned int j = 0; j < M; j++)
- y0 += A[i + j * N] * X[j];
+ y0 += A[i + j * lda] * X[j];
Y[i] = y0;
})";
})";
/**
- * @brief declaring global kernel objects
+ * @brief defining global kernel objects
*/
opencl::Kernel kernel_sgemv;
opencl::Kernel kernel_sgemm;
break;
}
- result = kernel_sgemv.SetKernelArguments(4, &dim2, sizeof(int));
+ result = kernel_sgemv.SetKernelArguments(4, &lda, sizeof(int));
if (!result) {
break;
}
} while (false);
}
+
} // namespace nntrainer
* @brief declaring global kernel objects
*/
extern opencl::Kernel kernel_sgemv;
+extern opencl::Kernel kernel_sgemv_fp16;
extern opencl::Kernel kernel_sgemm;
+extern opencl::Kernel kernel_sgemm_fp16;
extern opencl::Kernel kernel_dot;
+extern opencl::Kernel kernel_dot_fp16;
/**
* @brief sgemv computation : Y = A*X + Y
unsigned int dim1, unsigned int dim2, unsigned int lda,
RunLayerContext &context);
+/**
+ * @brief fp16 sgemv computation : Y = A*X + Y
+ * @param[in] matAdata fp16 * for Matrix A
+ * @param[in] vecXdata fp16 * for Vector X
+ * @param[in] vecYdata fp16 * for Vector Y
+ * @param[in] dim1 number of A's columns
+ * @param[in] dim2 number of A's rows
+ * @param[in] lda leading dimension of matrix A
+ * @param[in] context RunLayerContext reference
+ */
+void sgemv_cl(const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
+ unsigned int dim1, unsigned int dim2, unsigned int lda,
+ RunLayerContext &context);
+
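+/**
+ * Usage sketch (illustrative only, not part of the library API): the kernel
+ * computes Y[i] = sum_j A[i + j * lda] * X[j] for i in [0, dim2) and
+ * j in [0, dim1), so X holds dim1 elements, Y holds dim2 elements, and lda
+ * is the stride between consecutive columns of A (dim2 when A is densely
+ * packed). Assuming an existing RunLayerContext rc:
+ * @code
+ *   std::vector<__fp16> A(dim1 * dim2, 1.0f), X(dim1, 1.0f), Y(dim2);
+ *   sgemv_cl(A.data(), X.data(), Y.data(), dim1, dim2, dim2, rc);
+ *   // each Y[i] == dim1 afterwards
+ * @endcode
+ */
+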
/**
 * @brief dot computation : sum of all A * X
* @param[in] vecAdata float * for Vector A
* @param[in] vecXdata float * for Vector X
* @param[in] dim1 number of elements in both input vectors
* @param[in] context RunLayerContext reference
+ * @return float dot product result
*/
float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
RunLayerContext &context);
+/**
+ * @brief fp16 dot computation : sum of all A * X
+ * @param[in] vecAdata fp16 * for Vector A
+ * @param[in] vecXdata fp16 * for Vector X
+ * @param[in] dim1 number of elements in both input vectors
+ * @param[in] context RunLayerContext reference
+ * @return fp16 dot product result
+ */
+__fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, unsigned int dim1,
+ RunLayerContext &context);
+
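+/**
+ * Usage sketch (illustrative only, not part of the library API): returns
+ * sum_i vecAdata[i] * vecXdata[i] over dim1 elements. Assuming an existing
+ * RunLayerContext rc:
+ * @code
+ *   std::vector<__fp16> a(8, 1.0f), x(8, 2.0f);
+ *   __fp16 d = dot_cl(a.data(), x.data(), 8, rc); // d == 16.0
+ * @endcode
+ */
+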
/**
* @brief sgemm computation : Y = op(A)*op(B) + C,
* where op(X) is one of X or X**T
unsigned int N, unsigned int K, unsigned int lda,
unsigned int ldb, unsigned int ldc, RunLayerContext &context);
+/**
+ * @brief fp16 sgemm computation : Y = op(A)*op(B) + C,
+ * where op(X) is one of X or X**T
+ * @param[in] A fp16 * for Matrix A
+ * @param[in] B fp16 * for Matrix B
+ * @param[in] C fp16 * for Matrix C
+ * @param[in] M number of op(A)'s and C's rows
+ * @param[in] N number of op(B)'s and C's columns
+ * @param[in] K number of op(A)'s columns and op(B)'s rows
+ * @param[in] lda number of A's columns
+ * @param[in] ldb number of B's columns
+ * @param[in] ldc number of C's columns
+ * @param[in] context RunLayerContext reference
+ */
+void sgemm_cl(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
+ unsigned int N, unsigned int K, unsigned int lda,
+ unsigned int ldb, unsigned int ldc, RunLayerContext &context);
+
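+/**
+ * Usage sketch (illustrative only, not part of the library API): the kernel
+ * computes C[m * ldc + n] = sum_k A[m * lda + k] * B[k * ldb + n]; any
+ * transposition is expected to be reflected in the leading dimensions the
+ * caller supplies. For densely packed row-major operands, assuming an
+ * existing RunLayerContext rc:
+ * @code
+ *   const unsigned int M = 4, N = 4, K = 4;
+ *   std::vector<__fp16> A(M * K), B(K * N), C(M * N, 0.0f);
+ *   sgemm_cl(A.data(), B.data(), C.data(), M, N, K, K, N, N, rc);
+ * @endcode
+ */
+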
} // namespace nntrainer
#endif /* __BLAS_KERNELS_H__ */
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Debadri Samaddar <s.debadri@samsung.com>
+ *
+ * @file blas_kernels_fp16.cpp
+ * @date 29 May 2024
+ * @brief Common blas OpenCL fp16 kernels
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Debadri Samaddar <s.debadri@samsung.com>
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#include <blas_kernels.h>
+
+namespace nntrainer {
+
+std::string sgemv_cl_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void sgemv_cl_fp16(const __global half* A, const __global half* X,
+ __global half* Y, unsigned int M, unsigned int lda) {
+ unsigned int i;
+ i = get_global_id(0);
+ half y0 = 0.0f;
+ for (unsigned int j = 0; j < M; j++)
+ y0 += A[i + j * lda] * X[j];
+ Y[i] = y0;
+
+ })";
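+// Note: each work item i stores Y[i] = sum over j of A[i + j * lda] * X[j]
+// (Y is overwritten, not accumulated into), so the global work size must
+// cover every output element; sgemv_cl below dispatches dim2 work items.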
+
+std::string dot_cl_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void dot_cl_fp16(const __global half* A, const __global half* X, unsigned int K, __global half* res) {
+ *res = 0;
+ for (unsigned int i = 0; i < K; i++){
+ *res += A[i] * X[i];
+ }
+ })";
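+// Note: every work item launched runs the full loop and read-modify-writes
+// the same global location res, so the dim1-wide dispatch in dot_cl below
+// makes these updates race; a single work item (or a parallel reduction)
+// would make the accumulation well defined.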
+
+std::string sgemm_cl_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void sgemm_cl_fp16(const __global half* A, const __global half* B,
+ __global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
+
+ unsigned int m = get_global_id(0);
+ unsigned int n = get_global_id(1);
+ half c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ half a, b;
+ a = A[m * lda + k];
+ b = B[k * ldb + n];
+ c += a * b;
+ }
+ C[m * ldc + n] = c;
+ })";
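+// Note: one work item per output element; work item (m, n) computes
+// C[m * ldc + n] = sum over k of A[m * lda + k] * B[k * ldb + n],
+// overwriting any prior contents of C rather than accumulating into them.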
+
+/**
+ * @brief defining global kernel objects
+ */
+opencl::Kernel kernel_sgemv_fp16;
+opencl::Kernel kernel_sgemm_fp16;
+opencl::Kernel kernel_dot_fp16;
+
+void sgemv_cl(const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
+ unsigned int dim1, unsigned int dim2, unsigned int lda,
+ RunLayerContext &context) {
+
+ bool result = false;
+
+ do {
+ result = context.clCreateKernel(sgemv_cl_kernel_fp16_,
+ context.LayerKernel::SGEMV_FP16,
+ kernel_sgemv_fp16);
+ if (!result) {
+ break;
+ }
+
+ size_t dim1_size = sizeof(cl_half) * dim1;
+ size_t dim2_size = sizeof(cl_half) * dim2;
+ opencl::Buffer inputA(context.context_inst_, dim1 * dim2 * sizeof(cl_half),
+ true, nullptr);
+
+ opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr);
+
+ opencl::Buffer inOutY(context.context_inst_, dim2_size, true, nullptr);
+
+ result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ if (!result) {
+ break;
+ }
+
+ result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ if (!result) {
+ break;
+ }
+
+ result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemv_fp16.SetKernelArguments(0, &inputA, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemv_fp16.SetKernelArguments(1, &inputX, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemv_fp16.SetKernelArguments(2, &inOutY, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemv_fp16.SetKernelArguments(3, &dim1, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemv_fp16.SetKernelArguments(4, &lda, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ const int work_groups_count[3] = {(int)dim2, 1, 1};
+ const int work_group_size[3] = {32, 32, 1}; // test-value
+
+ result = context.command_queue_inst_.DispatchCommand(
+ kernel_sgemv_fp16, work_groups_count, work_group_size);
+ if (!result) {
+ break;
+ }
+
+ result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ if (!result) {
+ break;
+ }
+
+ } while (false);
+}
+
+__fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, unsigned int dim1,
+ RunLayerContext &context) {
+
+ bool result = false;
+
+ __fp16 cl_ret = 0;
+
+ do {
+ result = context.clCreateKernel(
+ dot_cl_kernel_fp16_, context.LayerKernel::DOT_FP16, kernel_dot_fp16);
+ if (!result) {
+ break;
+ }
+
+ size_t dim1_size = sizeof(cl_half) * dim1;
+
+ opencl::Buffer inputA(context.context_inst_, dim1_size, true, nullptr);
+
+ opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr);
+
+ opencl::Buffer dotResult(context.context_inst_, sizeof(__fp16), true,
+ &cl_ret);
+
+ result = inputA.WriteData(context.command_queue_inst_, vecAdata);
+ if (!result) {
+ break;
+ }
+
+ result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ if (!result) {
+ break;
+ }
+
+ result = kernel_dot_fp16.SetKernelArguments(0, &inputA, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_dot_fp16.SetKernelArguments(1, &inputX, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_dot_fp16.SetKernelArguments(2, &dim1, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_dot_fp16.SetKernelArguments(3, &dotResult, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ const int work_groups_count[3] = {(int)dim1, 1, 1};
+ const int work_group_size[3] = {32, 32, 1}; // test-value
+
+ result = context.command_queue_inst_.DispatchCommand(
+ kernel_dot_fp16, work_groups_count, work_group_size);
+ if (!result) {
+ break;
+ }
+
+ result = dotResult.ReadData(context.command_queue_inst_, &cl_ret);
+ if (!result) {
+ break;
+ }
+
+ } while (false);
+
+ return cl_ret;
+}
+
+void sgemm_cl(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
+ unsigned int N, unsigned int K, unsigned int lda,
+ unsigned int ldb, unsigned int ldc, RunLayerContext &context) {
+
+ bool result = false;
+
+ do {
+ result = context.clCreateKernel(sgemm_cl_kernel_fp16_,
+ context.LayerKernel::SGEMM_FP16,
+ kernel_sgemm_fp16);
+ if (!result) {
+ break;
+ }
+
+ size_t m_k_size = M * K * sizeof(cl_half);
+ size_t k_n_size = K * N * sizeof(cl_half);
+ size_t m_n_size = M * N * sizeof(cl_half);
+
+ opencl::Buffer inputA(context.context_inst_, m_k_size, true, nullptr);
+
+ opencl::Buffer inputB(context.context_inst_, k_n_size, true, nullptr);
+
+ opencl::Buffer inOutC(context.context_inst_, m_n_size, true, nullptr);
+
+ result = inputA.WriteData(context.command_queue_inst_, A);
+ if (!result) {
+ break;
+ }
+
+ result = inputB.WriteData(context.command_queue_inst_, B);
+ if (!result) {
+ break;
+ }
+
+ result = inOutC.WriteData(context.command_queue_inst_, C);
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm_fp16.SetKernelArguments(0, &inputA, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm_fp16.SetKernelArguments(1, &inputB, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm_fp16.SetKernelArguments(2, &inOutC, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm_fp16.SetKernelArguments(3, &K, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm_fp16.SetKernelArguments(4, &lda, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm_fp16.SetKernelArguments(5, &ldb, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm_fp16.SetKernelArguments(6, &ldc, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ const int work_groups_count[3] = {(int)M, (int)N, 1};
+ const int work_group_size[3] = {32, 32, 1}; // test-value
+
+ result = context.command_queue_inst_.DispatchCommand(
+ kernel_sgemm_fp16, work_groups_count, work_group_size);
+ if (!result) {
+ break;
+ }
+
+ result = inOutC.ReadData(context.command_queue_inst_, C);
+ if (!result) {
+ break;
+ }
+
+ } while (false);
+}
+} // namespace nntrainer
else {
sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
}
- } else
- throw std::invalid_argument("Error: OpenCL fp16 is not supported yet.");
+ } else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
+#ifdef ENABLE_FP16
+ const _FP16 *data = input.getData<_FP16>();
+ const _FP16 *mdata = weight.getData<_FP16>();
+ _FP16 *rdata = result.getData<_FP16>();
+ const float alpha = 1.0f;
+
+ /// shortcut handling in case of vector
+ /// for vector, (1 * K) == (K * 1) in current memory layout...
+ /// and please note that N, K, M are fixed placeholders after considering
+ /// transpose.
+ /// For example, there is no case like (1 * K) X (1 * K) while
+ /// (1 * K) X (1 * M) can be a case
+ /// case1: (1 * K) X (K * 1)
+ if (M == 1 && N == 1) {
+ *rdata = dot_cl(data, mdata, K, context) + (*rdata);
+ }
+ /// case2: (M * K) X (K * 1)
+ else if (N == 1) {
+ sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context);
+ }
+ /// case3: (1 * K) X (K * N) = 1 * N = R
+ /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
+ /// Effectively a translation of sgemv
+ else if (M == 1) {
+ sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context);
+ }
+ /// case others: use sgemm
+ else {
+ sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
+ }
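+ /// Worked illustration: with the fp16 golden test added below
+ /// ("3:1:1:10" input, unit=5), forwarding has M = 3, K = 10, N = 5,
+ /// so none of the vector shortcuts apply and sgemm_cl is used.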
+#else
+ throw std::invalid_argument("Error: enable-fp16 is not enabled");
+#endif
+ }
}
void FullyConnectedLayerCl::incremental_forwarding(RunLayerContext &context,
input_step_dim.height(to - from);
hidden_step_dim.height(to - from);
- // @todo: set reset stride as false. This implementation only works when batch
- // size is 1
+ // @todo: set reset stride as false. This implementation only works when
+ // batch size is 1
Tensor input_step = input_.getSharedDataTensor(input_step_dim, 0, true);
Tensor hidden_step = hidden_.getSharedDataTensor(hidden_step_dim, 0, true);
cl_layer_sources = [
'fc_layer_cl.cpp',
- 'blas_kernels.cpp'
+ 'blas_kernels.cpp',
]
+if get_option('enable-fp16')
+ cl_layer_sources += 'blas_kernels_fp16.cpp'
+endif
+
foreach s : cl_layer_sources
nntrainer_sources += meson.current_source_dir() / s
endforeach
return "dot_cl";
case LayerKernel::SGEMM:
return "sgemm_cl";
+ case LayerKernel::SGEMV_FP16:
+ return "sgemv_cl_fp16";
+ case LayerKernel::DOT_FP16:
+ return "dot_cl_fp16";
+ case LayerKernel::SGEMM_FP16:
+ return "sgemm_cl_fp16";
default:
return "";
}
* getKernelName function.
*/
enum LayerKernel {
- SGEMV = 1, /**< placeholder for kernel name */
- DOT = 2, /**< placeholder for kernel name */
- SGEMM = 4 /**< placeholder for kernel name */
+ SGEMV = 1 << 0,      /**< fp32 sgemv kernel */
+ DOT = 1 << 1,        /**< fp32 dot kernel */
+ SGEMM = 1 << 2,      /**< fp32 sgemm kernel */
+ SGEMV_FP16 = 1 << 3, /**< fp16 sgemv kernel */
+ DOT_FP16 = 1 << 4,   /**< fp16 dot kernel */
+ SGEMM_FP16 = 1 << 5, /**< fp16 sgemm kernel */
};
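+
+/**
+ * Minimal sketch (illustrative only): because each LayerKernel value now
+ * occupies a distinct bit, identifiers can be combined into a single mask
+ * and tested independently:
+ * @code
+ *   unsigned int mask = LayerKernel::SGEMV | LayerKernel::SGEMM_FP16;
+ *   bool has_fp16_gemm = (mask & LayerKernel::SGEMM_FP16) != 0; // true
+ * @endcode
+ */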
/**
fc_gpu_no_decay, fc_gpu_plain_nhwc,
fc_gpu_single_batch_nhwc,
fc_gpu_no_decay_nhwc));
+
+#ifdef ENABLE_FP16
+auto fc_gpu_basic_plain_w16a16 = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::FullyConnectedLayerCl>, {"unit=5"},
+ "3:1:1:10", "fc_plain_w16a16.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16");
+
+auto fc_gpu_basic_single_batch_w16a16 = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::FullyConnectedLayerCl>, {"unit=4"},
+ "1:1:1:10", "fc_single_batch_w16a16.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16");
+
+auto fc_gpu_basic_no_decay_w16a16 = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::FullyConnectedLayerCl>,
+ {"unit=5", "weight_decay=0.0", "bias_decay=0.0"}, "3:1:1:10",
+ "fc_plain_w16a16.nnlayergolden",
+ LayerGoldenTestParamOptions::SKIP_CALC_DERIV |
+ LayerGoldenTestParamOptions::SKIP_CALC_GRAD |
+ LayerGoldenTestParamOptions::USE_INC_FORWARD,
+ "nchw", "fp16", "fp16");
+
+GTEST_PARAMETER_TEST(FullyConnected16, LayerGoldenTest,
+ ::testing::Values(fc_gpu_basic_plain_w16a16,
+ fc_gpu_basic_single_batch_w16a16,
+ fc_gpu_basic_no_decay_w16a16));
+#endif