*/
#include <addition_layer_cl.h>
+#include <blas_kernel_strings.h>
#include <cl_context.h>
#include <concat_cl.h>
#include <fc_layer_cl.h>
return assigned_int_key;
}
+void ClContext::initBlasClKernels() {
+ if (blas_kernels_initialized) {
+ ml_logi(
+ "ClContext: Default blas kernels already registered and initialized");
+ return;
+ }
+
+ registerClKernel(sgemv_cl_kernel_, "sgemv_cl");
+ registerClKernel(dot_cl_kernel_, "dot_cl");
+ registerClKernel(sgemm_cl_noTrans_kernel_, "sgemm_cl_noTrans");
+ registerClKernel(sgemm_cl_transA_kernel_, "sgemm_cl_transA");
+ registerClKernel(sgemm_cl_transB_kernel_, "sgemm_cl_transB");
+ registerClKernel(sgemm_cl_transAB_kernel_, "sgemm_cl_transAB");
+ registerClKernel(addition_cl_kernel_, "addition_cl");
+ registerClKernel(sscal_cl_kernel_, "sscal_cl");
+
+#ifdef ENABLE_FP16
+ registerClKernel(sgemv_cl_kernel_fp16_, "sgemv_cl_fp16");
+ registerClKernel(dot_cl_kernel_fp16_, "dot_cl_fp16");
+ registerClKernel(sgemm_cl_noTrans_kernel_fp16_, "sgemm_cl_noTrans_fp16");
+ registerClKernel(sgemm_cl_transA_kernel_fp16_, "sgemm_cl_transA_fp16");
+ registerClKernel(sgemm_cl_transB_kernel_fp16_, "sgemm_cl_transB_fp16");
+ registerClKernel(sgemm_cl_transAB_kernel_fp16_, "sgemm_cl_transAB_fp16");
+ registerClKernel(addition_cl_kernel_fp16_, "addition_cl_fp16");
+ registerClKernel(sscal_cl_kernel_fp16_, "sscal_cl_fp16");
+#endif
+ blas_kernels_initialized = true;
+}
+
const ClContext::SharedPtrClKernel &
ClContext::registerClKernel(std::string kernel_string,
std::string kernel_name) {
const SharedPtrClKernel ®isterClKernel(std::string kernel_string,
std::string kernel_name);
+ /**
+ * @brief Initialize and register all blas OpenCl kernels
+ */
+ void initBlasClKernels();
+
/**
* @brief destructor to release opencl commandQueue
*/
// flag to check opencl commandqueue and context inititalization
bool cl_initialized = false;
+ // flag to check default blas kernels registered or not
+ bool blas_kernels_initialized = false;
+
FactoryMap<nntrainer::Layer> factory_map;
template <typename Args, typename T> struct isSupportedHelper;
- // map to store initialized opencl::Kernel
- OclKernelMap ocl_kernel_map;
+ // global map to store initialized opencl::Kernel
+ inline static OclKernelMap ocl_kernel_map;
/**
* @brief supportHelper to check if given type is supported within cl context
#ifdef ENABLE_OPENCL
if (compute_engine == ml::train::LayerComputeEngine::GPU) {
auto &cc = nntrainer::ClContext::Global();
+ cc.initBlasClKernels();
return createLayerNode(cc.createObject<nntrainer::Layer>(type), properties,
compute_engine);
}
#ifdef ENABLE_OPENCL
if (compute_engine == ml::train::LayerComputeEngine::GPU) {
auto &cc = nntrainer::ClContext::Global();
+ cc.initBlasClKernels();
return createLayerNode(cc.createObject<nntrainer::Layer>(type), properties,
compute_engine);
}
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Debadri Samaddar <s.debadri@samsung.com>
+ *
+ * @file blas_kernel_strings.h
+ * @date 18 Sep 2024
+ * @brief All blas OpenCL kernel strings
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Debadri Samaddar <s.debadri@samsung.com>
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#ifndef __BLAS_KERNEL_STRINGS_H__
+#define __BLAS_KERNEL_STRINGS_H__
+
+#include <string>
+
+namespace nntrainer {
+static const std::string sgemv_cl_kernel_ =
+ R"(__kernel void sgemv_cl(const __global float* A, const __global float* X,
+ __global float* Y, unsigned int N, unsigned int lda) {
+ unsigned int i;
+ i = get_global_id(0);
+ float y0 = 0.0f;
+ for (unsigned int j = 0; j < N; j++)
+ y0 += A[i + j * lda] * X[j];
+ Y[i] = y0;
+
+ })";
+
+static const std::string dot_cl_kernel_ =
+ R"(__kernel void dot_cl(const __global float* A, const __global float* X, unsigned int K, __global float* res) {
+ *res = 0;
+ for (unsigned int i = 0; i < K; i++){
+ *res += A[i] * X[i];
+ }
+ })";
+
+static const std::string sgemm_cl_noTrans_kernel_ =
+ R"(__kernel void sgemm_cl_noTrans(const __global float* A, const __global float* B,
+ __global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
+
+ unsigned int m = get_global_id(0);
+ unsigned int n = get_global_id(1);
+ float c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ float a, b;
+ a = A[m * lda + k];
+ b = B[k * ldb + n];
+ c += a * b;
+ }
+ C[m * ldc + n] = c;
+ })";
+
+static const std::string sgemm_cl_transA_kernel_ =
+ R"(__kernel void sgemm_cl_transA(const __global float* A, const __global float* B,
+ __global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
+
+ unsigned int m = get_global_id(0);
+ unsigned int n = get_global_id(1);
+ float c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ float a, b;
+ a = A[k * lda + m];
+ b = B[k * ldb + n];
+ c += a * b;
+ }
+ C[m * ldc + n] = c;
+ })";
+
+static const std::string sgemm_cl_transB_kernel_ =
+ R"(__kernel void sgemm_cl_transB(const __global float *A, const __global float *B,
+ __global float *C, unsigned int K,
+ unsigned int lda, unsigned int ldb,
+ unsigned int ldc) {
+
+ unsigned int m = get_global_id(0);
+ unsigned int n = get_global_id(1);
+ float c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ float a, b;
+ a = A[m * lda + k];
+ b = B[n * ldb + k];
+ c += a * b;
+ }
+ C[m * ldc + n] = c;
+ })";
+
+static const std::string sgemm_cl_transAB_kernel_ =
+ R"(__kernel void sgemm_cl_transAB(const __global float *A, const __global float *B,
+ __global float *C, unsigned int K,
+ unsigned int lda, unsigned int ldb,
+ unsigned int ldc) {
+
+ unsigned int m = get_global_id(0);
+ unsigned int n = get_global_id(1);
+ float c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ float a, b;
+ a = A[k * lda + m];
+ b = B[n * ldb + k];
+ c += a * b;
+ }
+ C[m * ldc + n] = c;
+ })";
+
+static const std::string addition_cl_kernel_ =
+ R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) {
+ #pragma printf_support
+ size_t idx = get_global_id(0);
+ if (idx < size) {
+ output[idx] = output[idx] + input[idx];
+ }
+ })";
+
+static const std::string sscal_cl_kernel_ =
+ R"(__kernel void sscal_cl(__global float* X, const float alpha) {
+
+ unsigned int i = get_global_id(0);
+ X[i] *= alpha;
+ })";
+
+#ifdef ENABLE_FP16
+static const std::string sgemv_cl_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void sgemv_cl_fp16(const __global half* A, const __global half* X,
+ __global half* Y, unsigned int N, unsigned int lda) {
+ unsigned int i;
+ i = get_global_id(0);
+ half y0 = 0.0f;
+ for (unsigned int j = 0; j < N; j++)
+ y0 += A[i + j * lda] * X[j];
+ Y[i] = y0;
+
+ })";
+
+static const std::string dot_cl_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void dot_cl_fp16(const __global half* A, const __global half* X, unsigned int K, __global half* res) {
+ *res = 0;
+ for (unsigned int i = 0; i < K; i++){
+ *res += A[i] * X[i];
+ }
+ })";
+
+static const std::string sgemm_cl_noTrans_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void sgemm_cl_noTrans_fp16(const __global half* A, const __global half* B,
+ __global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
+
+ unsigned int m = get_global_id(0);
+ unsigned int n = get_global_id(1);
+ half c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ half a, b;
+ a = A[m * lda + k];
+ b = B[k * ldb + n];
+ c += a * b;
+ }
+ C[m * ldc + n] = c;
+ })";
+
+static const std::string sgemm_cl_transA_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void sgemm_cl_transA_fp16(const __global half* A, const __global half* B,
+ __global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
+
+ unsigned int m = get_global_id(0);
+ unsigned int n = get_global_id(1);
+ half c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ half a, b;
+ a = A[k * lda + m];
+ b = B[k * ldb + n];
+ c += a * b;
+ }
+ C[m * ldc + n] = c;
+ })";
+
+static const std::string sgemm_cl_transB_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void sgemm_cl_transB_fp16(const __global half* A, const __global half* B,
+ __global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
+
+ unsigned int m = get_global_id(0);
+ unsigned int n = get_global_id(1);
+ half c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ half a, b;
+ a = A[m * lda + k];
+ b = B[n * ldb + k];
+ c += a * b;
+ }
+ C[m * ldc + n] = c;
+ })";
+
+static const std::string sgemm_cl_transAB_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void sgemm_cl_transAB_fp16(const __global half* A, const __global half* B,
+ __global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
+
+ unsigned int m = get_global_id(0);
+ unsigned int n = get_global_id(1);
+ half c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ half a, b;
+ a = A[k * lda + m];
+ b = B[n * ldb + k];
+ c += a * b;
+ }
+ C[m * ldc + n] = c;
+ })";
+
+static const std::string addition_cl_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void addition_cl_fp16(__global const half* input, __global half* output, const unsigned int size) {
+ size_t idx = get_global_id(0);
+ if (idx < size) {
+ output[idx] = output[idx] + input[idx];
+ }
+ })";
+
+static const std::string sscal_cl_kernel_fp16_ =
+ R"(
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+ __kernel void sscal_cl_fp16(__global half* X, const float alpha) {
+
+ unsigned int i = get_global_id(0);
+ X[i] *= alpha;
+ })";
+#endif
+} // namespace nntrainer
+#endif /* __BLAS_KERNEL_INTERFACE_H__ */
*
*/
+#include <blas_kernel_strings.h>
#include <blas_kernels.h>
namespace nntrainer {
-// get global cl_context to use in kernels
-ClContext &cl_context_ref = ClContext::Global();
-
-std::string sgemv_cl_kernel_ =
- R"(__kernel void sgemv_cl(const __global float* A, const __global float* X,
- __global float* Y, unsigned int N, unsigned int lda) {
- unsigned int i;
- i = get_global_id(0);
- float y0 = 0.0f;
- for (unsigned int j = 0; j < N; j++)
- y0 += A[i + j * lda] * X[j];
- Y[i] = y0;
-
- })";
-
-std::string dot_cl_kernel_ =
- R"(__kernel void dot_cl(const __global float* A, const __global float* X, unsigned int K, __global float* res) {
- *res = 0;
- for (unsigned int i = 0; i < K; i++){
- *res += A[i] * X[i];
- }
- })";
-
-std::string sgemm_cl_noTrans_kernel_ =
- R"(__kernel void sgemm_cl_noTrans(const __global float* A, const __global float* B,
- __global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
-
- unsigned int m = get_global_id(0);
- unsigned int n = get_global_id(1);
- float c = 0.0f;
- for (unsigned int k = 0; k < K; ++k) {
- float a, b;
- a = A[m * lda + k];
- b = B[k * ldb + n];
- c += a * b;
- }
- C[m * ldc + n] = c;
- })";
-
-std::string sgemm_cl_transA_kernel_ =
- R"(__kernel void sgemm_cl_transA(const __global float* A, const __global float* B,
- __global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
-
- unsigned int m = get_global_id(0);
- unsigned int n = get_global_id(1);
- float c = 0.0f;
- for (unsigned int k = 0; k < K; ++k) {
- float a, b;
- a = A[k * lda + m];
- b = B[k * ldb + n];
- c += a * b;
- }
- C[m * ldc + n] = c;
- })";
-
-std::string sgemm_cl_transB_kernel_ =
- R"(__kernel void sgemm_cl_transB(const __global float *A, const __global float *B,
- __global float *C, unsigned int K,
- unsigned int lda, unsigned int ldb,
- unsigned int ldc) {
-
- unsigned int m = get_global_id(0);
- unsigned int n = get_global_id(1);
- float c = 0.0f;
- for (unsigned int k = 0; k < K; ++k) {
- float a, b;
- a = A[m * lda + k];
- b = B[n * ldb + k];
- c += a * b;
- }
- C[m * ldc + n] = c;
- })";
-
-std::string sgemm_cl_transAB_kernel_ =
- R"(__kernel void sgemm_cl_transAB(const __global float *A, const __global float *B,
- __global float *C, unsigned int K,
- unsigned int lda, unsigned int ldb,
- unsigned int ldc) {
-
- unsigned int m = get_global_id(0);
- unsigned int n = get_global_id(1);
- float c = 0.0f;
- for (unsigned int k = 0; k < K; ++k) {
- float a, b;
- a = A[k * lda + m];
- b = B[n * ldb + k];
- c += a * b;
- }
- C[m * ldc + n] = c;
- })";
-
-std::string addition_cl_kernel_ =
- R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) {
- #pragma printf_support
- size_t idx = get_global_id(0);
- if (idx < size) {
- output[idx] = output[idx] + input[idx];
- }
- })";
-
-std::string sscal_cl_kernel_ =
- R"(__kernel void sscal_cl(__global float* X, const float alpha) {
-
- unsigned int i = get_global_id(0);
- X[i] *= alpha;
- })";
-
void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata,
unsigned int dim1, unsigned int dim2, unsigned int lda) {
namespace nntrainer {
// get global cl_context to use in kernels
-extern ClContext &cl_context_ref;
+static ClContext cl_context_ref;
/**
* @brief sgemv computation : Y = A*X + Y
*
*/
+#include <blas_kernel_strings.h>
#include <blas_kernels.h>
namespace nntrainer {
-std::string sgemv_cl_kernel_fp16_ =
- R"(
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
- __kernel void sgemv_cl_fp16(const __global half* A, const __global half* X,
- __global half* Y, unsigned int N, unsigned int lda) {
- unsigned int i;
- i = get_global_id(0);
- half y0 = 0.0f;
- for (unsigned int j = 0; j < N; j++)
- y0 += A[i + j * lda] * X[j];
- Y[i] = y0;
-
- })";
-
-std::string dot_cl_kernel_fp16_ =
- R"(
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
- __kernel void dot_cl_fp16(const __global half* A, const __global half* X, unsigned int K, __global half* res) {
- *res = 0;
- for (unsigned int i = 0; i < K; i++){
- *res += A[i] * X[i];
- }
- })";
-
-std::string sgemm_cl_noTrans_kernel_fp16_ =
- R"(
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
- __kernel void sgemm_cl_noTrans_fp16(const __global half* A, const __global half* B,
- __global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
-
- unsigned int m = get_global_id(0);
- unsigned int n = get_global_id(1);
- half c = 0.0f;
- for (unsigned int k = 0; k < K; ++k) {
- half a, b;
- a = A[m * lda + k];
- b = B[k * ldb + n];
- c += a * b;
- }
- C[m * ldc + n] = c;
- })";
-
-std::string sgemm_cl_transA_kernel_fp16_ =
- R"(
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
- __kernel void sgemm_cl_transA_fp16(const __global half* A, const __global half* B,
- __global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
-
- unsigned int m = get_global_id(0);
- unsigned int n = get_global_id(1);
- half c = 0.0f;
- for (unsigned int k = 0; k < K; ++k) {
- half a, b;
- a = A[k * lda + m];
- b = B[k * ldb + n];
- c += a * b;
- }
- C[m * ldc + n] = c;
- })";
-
-std::string sgemm_cl_transB_kernel_fp16_ =
- R"(
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
- __kernel void sgemm_cl_transB_fp16(const __global half* A, const __global half* B,
- __global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
-
- unsigned int m = get_global_id(0);
- unsigned int n = get_global_id(1);
- half c = 0.0f;
- for (unsigned int k = 0; k < K; ++k) {
- half a, b;
- a = A[m * lda + k];
- b = B[n * ldb + k];
- c += a * b;
- }
- C[m * ldc + n] = c;
- })";
-
-std::string sgemm_cl_transAB_kernel_fp16_ =
- R"(
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
- __kernel void sgemm_cl_transAB_fp16(const __global half* A, const __global half* B,
- __global half* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
-
- unsigned int m = get_global_id(0);
- unsigned int n = get_global_id(1);
- half c = 0.0f;
- for (unsigned int k = 0; k < K; ++k) {
- half a, b;
- a = A[k * lda + m];
- b = B[n * ldb + k];
- c += a * b;
- }
- C[m * ldc + n] = c;
- })";
-
-std::string addition_cl_kernel_fp16_ =
- R"(
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
- __kernel void addition_cl_fp16(__global const half* input, __global half* output, const unsigned int size) {
- size_t idx = get_global_id(0);
- if (idx < size) {
- output[idx] = output[idx] + input[idx];
- }
- })";
-
-std::string sscal_cl_kernel_fp16_ =
- R"(
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
- __kernel void sscal_cl_fp16(__global half* X, const float alpha) {
-
- unsigned int i = get_global_id(0);
- X[i] *= alpha;
- })";
-
void sgemv_cl(const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
unsigned int dim1, unsigned int dim2, unsigned int lda) {
cl_op_headers = [
'blas_kernel_interface.h',
+ 'blas_kernel_strings.h',
]
if get_option('enable-fp16')
using namespace nntrainer;
-static RunLayerContext setUpGpuContext() {
-
+static void setUpGpuContext() {
auto &ac = nntrainer::ClContext::Global();
- auto rc = RunLayerContext();
-
- return rc;
+ ac.initBlasClKernels();
}
TEST(blas_kernels, dotCL_sgemv) {
- RunLayerContext rc = setUpGpuContext();
-
+ setUpGpuContext();
int batch = 1;
int channel = 1;
int height = 1;
MOD) *
alpha);
- nntrainer::Tensor C = dotCl(A_fp32, B_fp32, rc, transA, transB);
+ nntrainer::Tensor C = dotCl(A_fp32, B_fp32, transA, transB);
nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB);
float mseErrorNeon =
}
TEST(blas_kernels, dotCL_sgemv_n) {
- RunLayerContext rc = setUpGpuContext();
int batch = 1;
int channel = 1;
MOD) *
alpha);
- EXPECT_THROW(dotCl(A_fp32, B_fp32, rc, transA, transB), std::runtime_error);
+ EXPECT_THROW(dotCl(A_fp32, B_fp32, transA, transB), std::runtime_error);
}
TEST(nntrainer_Tensor, multiply_i) {
- RunLayerContext rc = setUpGpuContext();
int batch = 1;
int channel = 1;
k * (width)*alpha + l + 1);
// fp16
- multiplyCl(input, 0.1, rc);
+ multiplyCl(input, 0.1);
// fp32
- multiplyCl(input_fp32, 0.1, rc);
+ multiplyCl(input_fp32, 0.1);
float mseErrorNeon = mse<__fp16>(input.getData<__fp16>(),
input_fp32.getData<float>(), input.size());
TEST(nntrainer_Tensor, dot_gemm_50_768_1024_noTrans) {
/// @note GEMM : A X B = C
- RunLayerContext rc = setUpGpuContext();
int batch = 1;
int channel = 1;
MOD) *
alpha);
- nntrainer::Tensor C = dotCl(A_fp32, B_fp32, rc, transA, transB);
+ nntrainer::Tensor C = dotCl(A_fp32, B_fp32, transA, transB);
nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB);
float mseErrorNeon =
TEST(nntrainer_Tensor, dot_gemm_50_768_2048_transB) {
/// @note GEMM : A X B = C
- RunLayerContext rc = setUpGpuContext();
int batch = 1;
int channel = 1;
MOD) *
alpha);
- nntrainer::Tensor C = dotCl(A_fp32, B_fp32, rc, transA, transB);
+ nntrainer::Tensor C = dotCl(A_fp32, B_fp32, transA, transB);
nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB);
float mseErrorNeon =
TEST(nntrainer_Tensor, dot_gemm_50_768_1024_transA) {
/// @note GEMM : A X B = C
- RunLayerContext rc = setUpGpuContext();
int batch = 1;
int channel = 1;
MOD) *
alpha);
- nntrainer::Tensor C = dotCl(A_fp32, B_fp32, rc, transA, transB);
+ nntrainer::Tensor C = dotCl(A_fp32, B_fp32, transA, transB);
nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB);
float mseErrorNeon =
TEST(nntrainer_Tensor, dot_gemm_50_768_2048_transAB) {
/// @note GEMM : A X B = C
- RunLayerContext rc = setUpGpuContext();
int batch = 1;
int channel = 1;
MOD) *
alpha);
- nntrainer::Tensor C = dotCl(A_fp32, B_fp32, rc, transA, transB);
+ nntrainer::Tensor C = dotCl(A_fp32, B_fp32, transA, transB);
nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB);
float mseErrorNeon =