- The current implementation is built around the general case, and thus optimizes only w.r.t. K accumulation.
- However, for an M,1 x 1,N computation, optimizations such as packing and transposing are of no use.
- Implementing an explicit kernel function for this case resolves the latency issue.
**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped
Signed-off-by: skykongkong8 <ss.kong@samsung.com>
void hgemm(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M, uint32_t N,
           uint32_t K, float alpha, float beta, bool TransA, bool TransB) {
  // K == 1 fast path: op(A) is M x 1 and op(B) is 1 x N, so the general
  // packing / K-accumulation machinery is pure overhead; dispatch to the
  // dedicated rank-1 kernel instead. For K == 1 the memory layout of A is
  // identical whether or not it is transposed, so only the leading
  // dimensions need adjusting.
  if (K == 1) {
    unsigned int lda = (TransA) ? M : K;
    unsigned int ldb = (TransB) ? K : N;
    return hgemm_K1(M, N, K, A, lda, B, ldb, C, N, alpha, beta);
  }
  // dynamic creation to avoid reaching stack limit(causes segmentation fault)
  float *C32 = (float *)malloc(M * N * sizeof(float));
  // NOTE(review): the remainder of this function (presumably fp32
  // accumulation into C32 and conversion back to __fp16) is elided in this
  // excerpt; B8 appears to be a packed buffer allocated in the omitted
  // portion — confirm against the full file. C32 must also be freed there.
  free(B8);
}
/**
 * @brief Specialized hgemm kernel for K == 1 :
 *        C = alpha * A * B + beta * C, where A is an M x 1 column vector and
 *        B is a 1 x N row vector (rank-1 product). Packing / transposing is
 *        pointless here, so the result is formed directly per element.
 *
 * @param M length of the row of matrix A
 * @param N length of the col of matrix B
 * @param K length of the col of matrix A (expected to be 1)
 * @param A input matrix A
 * @param lda leading dimension of A (unused: A is a contiguous M-vector)
 * @param B input matrix B
 * @param ldb leading dimension of B (unused: B is a contiguous N-vector)
 * @param C output matrix C
 * @param ldc leading dimension of C
 * @param[in] alpha float number
 * @param[in] beta float number
 */
void hgemm_K1(unsigned int M, unsigned int N, unsigned int K, const __fp16 *A,
              unsigned int lda, const __fp16 *B, unsigned int ldb, __fp16 *C,
              unsigned int ldc, float alpha, float beta) {
  const unsigned int N8 = (N >> 3) << 3;   // largest multiple of 8 <= N
  // FIX(review): the previous version silently ignored alpha and beta even
  // though the hgemm dispatcher passes them through unconditionally.
  const bool accumulate = (beta != 0.F);   // beta == 0 -> plain overwrite
  const float16x8_t beta_vec = vmovq_n_f16(static_cast<__fp16>(beta));
  for (unsigned int m = 0; m < M; ++m) {
    // Fold alpha into the broadcast scalar once per row.
    const __fp16 a = static_cast<__fp16>(alpha * static_cast<float>(A[m]));
    const float16x8_t a_vec = vmovq_n_f16(a);
    for (unsigned int n = 0; n < N8; n += 8) {
      float16x8_t c_vec = vmulq_f16(a_vec, vld1q_f16(&B[n]));
      if (accumulate)
        c_vec = vfmaq_f16(c_vec, beta_vec, vld1q_f16(&C[m * ldc + n]));
      vst1q_f16(&C[m * ldc + n], c_vec);
    }
    // Scalar tail for the remaining N % 8 columns.
    for (unsigned int n = N8; n < N; ++n) {
      __fp16 c = a * B[n];
      if (accumulate)
        c += static_cast<__fp16>(beta) * C[m * ldc + n];
      C[m * ldc + n] = c;
    }
  }
}
+
/**
 * @brief hgemm noTrans computation with 1x4 kernel : C = A*B,
 *        __fp16 output variant.
 * NOTE(review): this excerpt interleaved the parameter lists of the
 * __fp16*-C and float*-C overloads on adjacent lines; both declarations are
 * restored here — confirm against the full header.
 */
void hgemm_noTrans_1x4(unsigned int M, unsigned int N, unsigned int K,
                       const __fp16 *A, unsigned int lda, const __fp16 *B,
                       unsigned int ldb, __fp16 *C, unsigned int ldc,
                       float alpha = 1.F, float beta = 0.F);

/**
 * @brief hgemm noTrans computation with 1x4 kernel : C = A*B,
 *        fp32 output (accumulation) variant.
 */
void hgemm_noTrans_1x4(unsigned int M, unsigned int N, unsigned int K,
                       const __fp16 *A, unsigned int lda, const __fp16 *B,
                       unsigned int ldb, float *C, unsigned int ldc,
                       float alpha = 1.F, float beta = 0.F);
/**
 * @brief hgemm specialization for K == 1 with NEON :
 *        C = alpha * A * B + beta * C, where A is an M x 1 column vector and
 *        B is a 1 x N row vector (rank-1 product); the general packing /
 *        K-accumulation path is skipped entirely.
 * @param M length of the row of matrix A
 * @param N length of the col of matrix B
 * @param K length of the col of matrix A (expected to be 1)
 * @param A input matrix A
 * @param lda length of the col of matrix A
 * @param B input matrix B
 * @param ldb length of the col of matrix B
 * @param C output matrix C
 * @param ldc length of the col of matrix C
 * @param[in] alpha float number
 * @param[in] beta float number
 */
void hgemm_K1(unsigned int M, unsigned int N, unsigned int K,
              const __fp16 *A, unsigned int lda, const __fp16 *B,
              unsigned int ldb, __fp16 *C, unsigned int ldc,
              float alpha = 1.F, float beta = 0.F);
+
/**
* @brief hgemm noTrans computation with 1x4 kernel : C = A*B,
*