From: skykongkong8
Date: Thu, 20 Jun 2024 11:17:47 +0000 (+0900)
Subject: [ hgemm ] Use aligned memory allocation in transpose / padding gemm
X-Git-Tag: accepted/tizen/7.0/unified/20240830.164841~88
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3364a8889f400867a068097c1f82f454c9900c53;p=platform%2Fcore%2Fml%2Fnntrainer.git

[ hgemm ] Use aligned memory allocation in transpose / padding gemm

- Using unaligned memory may invoke SIGSEGV

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8
---

diff --git a/nntrainer/tensor/hgemm/hgemm.cpp b/nntrainer/tensor/hgemm/hgemm.cpp
index c8a31f21..2b48a630 100644
--- a/nntrainer/tensor/hgemm/hgemm.cpp
+++ b/nntrainer/tensor/hgemm/hgemm.cpp
@@ -66,8 +66,9 @@ void hgemm_noTrans_strict(const __fp16 *A, const __fp16 *B, float *C32,
   }
 }
 
-void hgemm_noTrans_strict(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
-                          unsigned int N, unsigned int K, float alpha, float beta) {
+void hgemm_noTrans_strict(const __fp16 *A, const __fp16 *B, __fp16 *C,
+                          unsigned int M, unsigned int N, unsigned int K,
+                          float alpha, float beta) {
   if (alpha == 1.F) {
     // used bitwise operator instead of modulo for performance
     // e.g (M % 8) is same as (M & 0x7) which will extract last 3 bits of M
@@ -96,8 +97,8 @@ void hgemm_noTrans_padding_wrt_K(const __fp16 *A, const __fp16 *B, float *C,
   const unsigned int lda = K;
   const unsigned int ldb = N;
 
-  __fp16 *A8 = new __fp16[M * K8_high];
-  __fp16 *B8 = new __fp16[K8_high * N];
+  __fp16 *A8 = alignedMalloc(M * K8_high);
+  __fp16 *B8 = alignedMalloc(K8_high * N);
 
   float16x8_t ZEROS = vmovq_n_f16(0.F);
 
@@ -829,7 +830,7 @@ void hgemm_noTrans_8x16(unsigned int M, unsigned int N, unsigned int K,
                         const __fp16 *A, unsigned int lda, const __fp16 *B,
                         unsigned int ldb, __fp16 *C, unsigned int ldc,
                         float alpha, float beta) {
-// M, N, K is full M, N, K here
+  // M, N, K is full M, N, K here
   __fp16 *sA = alignedMalloc(M * K);
   __fp16 *sB = alignedMalloc(K * N);
 
@@ -1257,7 +1258,7 @@ void hgemm_noTrans_fallback(unsigned int M, unsigned int N, unsigned int K,
 
 void hgemm_transB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
                   unsigned int N, unsigned int K, float alpha, float beta) {
-  __fp16 *B_T = new __fp16[K * N];
+  __fp16 *B_T = alignedMalloc(K * N);
 
   transpose_neon<__fp16>(N, K, B, K, B_T, N);
 
@@ -1268,7 +1269,7 @@ void hgemm_transB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
 
 void hgemm_transA(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
                   unsigned int N, unsigned int K, float alpha, float beta) {
-  __fp16 *A_T = new __fp16[M * K];
+  __fp16 *A_T = alignedMalloc(M * K);
 
   transpose_neon<__fp16>(K, M, A, M, A_T, K);
 
@@ -1279,8 +1280,8 @@ void hgemm_transA(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
 
 void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
                    unsigned int N, unsigned int K, float alpha, float beta) {
-  __fp16 *A_T = new __fp16[M * K];
-  __fp16 *B_T = new __fp16[K * N];
+  __fp16 *A_T = alignedMalloc(M * K);
+  __fp16 *B_T = alignedMalloc(K * N);
 
   transpose_neon<__fp16>(K, M, A, M, A_T, K);
   transpose_neon<__fp16>(N, K, B, K, B_T, N);
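
Note on the change above: alignedMalloc is a helper defined elsewhere in the hgemm sources; the sketch below is a hypothetical stand-in, not the actual nntrainer implementation. It only illustrates the rationale in the commit message: these transpose / padding buffers are read and written with 128-bit NEON instructions (e.g. vld1q_f16 / vst1q_f16), and a plain new __fp16[] only guarantees alignof(__fp16), which can trigger the SIGSEGV mentioned above on configurations that fault on misaligned vector access. The helper name aligned_fp16_alloc, the 16-byte alignment constant, and the example sizes are assumptions for illustration.

#include <arm_neon.h> // float16x8_t, vmovq_n_f16, vst1q_f16; __fp16 is an ARM extension
#include <cstdlib>    // std::aligned_alloc, std::free

// Hypothetical aligned allocator for __fp16 buffers (illustration only).
static __fp16 *aligned_fp16_alloc(size_t num_elems) {
  constexpr size_t kAlign = 16; // 128-bit NEON vector width in bytes
  size_t bytes = num_elems * sizeof(__fp16);
  // std::aligned_alloc requires the size to be a multiple of the alignment.
  bytes = (bytes + kAlign - 1) & ~(kAlign - 1);
  return static_cast<__fp16 *>(std::aligned_alloc(kAlign, bytes));
}

int main() {
  // Example sizes only: a padded A buffer as in hgemm_noTrans_padding_wrt_K.
  size_t M = 64, K8_high = 72;
  __fp16 *A8 = aligned_fp16_alloc(M * K8_high);
  float16x8_t zeros = vmovq_n_f16(0.F);
  vst1q_f16(A8, zeros); // 128-bit store on a 16-byte aligned buffer
  std::free(A8);
  return 0;
}

A buffer obtained this way must be released with std::free (or the project's matching alignedFree-style helper), not delete[], which pairs only with new[].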