From 6022e5629c7708b114a3c2387e652ebd32122300 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 28 Aug 2020 22:36:36 +0800 Subject: [PATCH] Refs #2587 fix small matrix c/zgemm bug. --- common_level3.h | 150 +++++++++++------------ interface/gemm.c | 22 ++-- kernel/generic/zgemm_small_matrix_kernel_b0_nn.c | 6 +- kernel/generic/zgemm_small_matrix_kernel_b0_nt.c | 6 +- kernel/generic/zgemm_small_matrix_kernel_b0_tn.c | 6 +- kernel/generic/zgemm_small_matrix_kernel_b0_tt.c | 6 +- kernel/generic/zgemm_small_matrix_kernel_nn.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_nt.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_tn.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_tt.c | 10 +- 10 files changed, 116 insertions(+), 120 deletions(-) diff --git a/common_level3.h b/common_level3.h index 5741f56..a3a487d 100644 --- a/common_level3.h +++ b/common_level3.h @@ -536,85 +536,85 @@ int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLA int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); -int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); -int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); #endif diff --git a/interface/gemm.c b/interface/gemm.c index b73baa9..7251993 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -106,47 +106,43 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B }; #ifdef SMALL_MATRIX_OPT -//Only support s/dgemm small matrix optimiztion so far. + +#ifndef COMPLEX static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifndef COMPLEX GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL, #endif -#endif }; static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifndef COMPLEX GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL, #endif -#endif }; -static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *,FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG) = { +#else + +static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifdef COMPLEX GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, #endif -#endif }; -static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifdef COMPLEX GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, #endif -#endif }; #endif +#endif #ifndef CBLAS @@ -479,9 +475,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS } #else if(beta[0] == 0.0 && beta[1] == 0.0){ - (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); }else{ - (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, (FLOAT *)(args.beta), args.c, args.ldc); + (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); } #endif return; diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c index 11e746e..3ab057f 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; @@ -65,8 +65,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c index 1ef7430..dc35f4a 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c index 2cd3ebc..479a56e 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c index 25b05b4..b698973 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_nn.c b/kernel/generic/zgemm_small_matrix_kernel_nn.c index 6ef1b96..4bf6bf7 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_nn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -65,12 +65,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_nt.c b/kernel/generic/zgemm_small_matrix_kernel_nt.c index 3c81ad7..288e49c 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_nt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_nt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_tn.c b/kernel/generic/zgemm_small_matrix_kernel_tn.c index 143190b..1e2a5ae 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_tn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_tn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_tt.c b/kernel/generic/zgemm_small_matrix_kernel_tt.c index 246e26e..1800435 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_tt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_tt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } -- 2.7.4