std::string sgemm_cl_kernel_ =
R"(__kernel void sgemm_cl(const __global float* A, const __global float* B,
- __global float* C, unsigned int M, unsigned int N, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
+ __global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
unsigned int m = get_global_id(0);
- for (unsigned int n = 0; n < N; ++n) {
- float c = 0.0f;
- for (unsigned int k = 0; k < K; ++k) {
- float a, b;
- a = A[m * lda + k];
- b = B[k * ldb + n];
- c += a * b;
- }
- C[m * ldc + n] = c;
+ unsigned int n = get_global_id(1);
+ float c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ float a, b;
+ a = A[m * lda + k];
+ b = B[k * ldb + n];
+ c += a * b;
}
+ C[m * ldc + n] = c;
})";
/**
size_t dim1_size = sizeof(float) * dim1;
size_t dim2_size = sizeof(float) * dim2;
- opencl::Buffer inputA(context.context_inst_, dim1_size * dim2_size, true,
- nullptr);
+ opencl::Buffer inputA(context.context_inst_, dim1 * dim2 * sizeof(float),
+ true, nullptr);
opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr);
break;
}
- const int work_groups_count[3] = {(int)dim1, 1, 1};
+ const int work_groups_count[3] = {(int)dim2, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
result = context.command_queue_inst_.DispatchCommand(
} while (false);
}
-float dot_cl(const float *matAdata, const float *vecXdata, unsigned int dim1,
+float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
RunLayerContext &context) {
bool result = false;
opencl::Buffer dotResult(context.context_inst_, sizeof(float), true,
&cl_ret);
- result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ result = inputA.WriteData(context.command_queue_inst_, vecAdata);
if (!result) {
break;
}
break;
}
- size_t m_size = sizeof(float) * M;
- size_t n_size = sizeof(float) * N;
- size_t k_size = sizeof(float) * K;
- opencl::Buffer inputA(context.context_inst_, m_size * k_size, true,
- nullptr);
+ size_t m_k_size = M * K * sizeof(float);
+ size_t k_n_size = K * N * sizeof(float);
+ size_t m_n_size = M * N * sizeof(float);
+
+ opencl::Buffer inputA(context.context_inst_, m_k_size, true, nullptr);
- opencl::Buffer inputB(context.context_inst_, k_size * n_size, true,
- nullptr);
+ opencl::Buffer inputB(context.context_inst_, k_n_size, true, nullptr);
- opencl::Buffer inOutC(context.context_inst_, m_size * n_size, true,
- nullptr);
+ opencl::Buffer inOutC(context.context_inst_, m_n_size, true, nullptr);
result = inputA.WriteData(context.command_queue_inst_, A);
if (!result) {
break;
}
- result = kernel_sgemm.SetKernelArguments(3, &M, sizeof(int));
- if (!result) {
- break;
- }
-
- result = kernel_sgemm.SetKernelArguments(4, &N, sizeof(int));
- if (!result) {
- break;
- }
-
- result = kernel_sgemm.SetKernelArguments(5, &K, sizeof(int));
+ result = kernel_sgemm.SetKernelArguments(3, &K, sizeof(int));
if (!result) {
break;
}
- result = kernel_sgemm.SetKernelArguments(6, &lda, sizeof(int));
+ result = kernel_sgemm.SetKernelArguments(4, &lda, sizeof(int));
if (!result) {
break;
}
- result = kernel_sgemm.SetKernelArguments(7, &ldb, sizeof(int));
+ result = kernel_sgemm.SetKernelArguments(5, &ldb, sizeof(int));
if (!result) {
break;
}
- result = kernel_sgemm.SetKernelArguments(8, &ldc, sizeof(int));
+ result = kernel_sgemm.SetKernelArguments(6, &ldc, sizeof(int));
if (!result) {
break;
}
- const int work_groups_count[3] = {(int)M, 1, 1};
+ const int work_groups_count[3] = {(int)M, (int)N, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
result = context.command_queue_inst_.DispatchCommand(
* @param[in] matAdata float * for Matrix A
* @param[in] vecXdata float * for Vector X
* @param[in] vecYdata float * for Vector Y
- * @param[in] dim1 number of A's row
- * @param[in] dim2 number of X's columns
+ * @param[in] dim1 number of A's columns
+ * @param[in] dim2 number of A's rows
* @param[in] lda number of X's columns
* @param[in] context RunLayerContext reference
*/
/**
* @brief dot computation : sum of all X * Y
- * @param[in] matAdata float * for Vector A
+ * @param[in] vecAdata float * for Vector A
* @param[in] vecXdata float * for Vector X
* @param[in] dim1 number of elements in both input vectors
* @param[in] context RunLayerContext reference
*/
-float dot_cl(const float *matAdata, const float *vecXdata, unsigned int dim1,
+float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
RunLayerContext &context);
/**