From 40b14e4957b9a5d9bbda30fc10aeeba485755f3c Mon Sep 17 00:00:00 2001
From: Bine Brank
Date: Wed, 29 Dec 2021 11:42:04 +0100
Subject: [PATCH] fix zgemm kernel

---
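Notes:

* zgemm_kernel_sve_v1x4.S: keep the alpha broadcasts in z6/z7 instead of
  z10/z11 (the unused alphaV0_R/alphaV0_I defines are dropped), so they
  stay clear of the kernel's working registers.  KERNELv1x4_I advances pA
  by "lanes, lsl #4" (lanes * 2 * 8 bytes, i.e. one vector of
  double-complex elements) after each ld2d, replacing the "#2, mul vl"
  offset plus a single "lsl #5" bump.  KERNELv1x4_M2 preloads z0/z1 for
  the next iteration instead of clobbering the z2/z3 operands it is about
  to consume.  The store paths write the computed pair {z24, z25}
  (previously they stored {z25, z26}), address each output column through
  its own pointer (pCRow0..pCRow3), and advance each pointer by
  "lanes, lsl #4" instead of an immediate #32, which is only correct for
  a 128-bit vector length.  The two-column tail initialises pCRow1 before
  it is used.

* zgemm_ncopy_sve_v1.c / zgemm_tcopy_sve_v1.c: "active" counts complex
  elements per predicated iteration, and each complex double occupies two
  doubles, so the copy pointers must advance by active * 2.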
 kernel/arm64/zgemm_kernel_sve_v1x4.S | 59 +++++++++++++++++-------------------
 kernel/arm64/zgemm_ncopy_sve_v1.c    |  2 +-
 kernel/arm64/zgemm_tcopy_sve_v1.c    |  2 +-
 3 files changed, 29 insertions(+), 34 deletions(-)

diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S
index 1201d6d..d5b3577 100644
--- a/kernel/arm64/zgemm_kernel_sve_v1x4.S
+++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S
@@ -53,12 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alphaR	x19
 #define alphaI	x20

-#define alphaz_R	z10.d
-#define alphaz_I	z11.d
-#define alpha0_R	d10
-#define alphaV0_R	v10.d[0]
-#define alpha0_I	d11
-#define alphaV0_I	v11.d[0]
+#define alphaz_R	z6.d
+#define alphaz_I	z7.d
+#define alpha0_R	d6
+#define alpha0_I	d7

 #define A_PRE_SIZE	2560

@@ -170,8 +168,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNELv1x4_I
 	ld2d	{z0.d, z1.d}, p1/z, [pA]
-	ld2d	{z2.d, z3.d}, p1/z, [pA, #2, mul vl]	// next one
-	add	pA, pA, lanes, lsl #5	// pA += lanes*2*2*8
+	add	pA, pA, lanes, lsl #4	// pA += lanes*2*8
+	ld2d	{z2.d, z3.d}, p1/z, [pA]	// next one
+	add	pA, pA, lanes, lsl #4	// pA += lanes*2*8

 	ld1rd	z8.d, p0/z, [pB]
 	ld1rd	z9.d, p0/z, [pB, 8]

@@ -283,7 +282,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm

 .macro KERNELv1x4_M2
-	ld2d	{z2.d, z3.d}, p1/z, [pA]
+	ld2d	{z0.d, z1.d}, p1/z, [pA]
 	add	pA, pA, lanes, lsl #4	// pA = pA + lanes * 2 * 8

 	OP_rr	z16.d, p1/m, z2.d, z8.d

@@ -396,39 +395,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmls	z24.d, p1/m, z17.d, alphaz_I
 	fmla	z25.d, p1/m, z16.d, alphaz_I
 	fmla	z25.d, p1/m, z17.d, alphaz_R
-	st2d	{z25.d, z26.d}, p1, [pCRow0]
+	st2d	{z24.d, z25.d}, p1, [pCRow0]

-	add	pCRow0, pCRow0, #32
+	add	pCRow0, pCRow0, lanes, lsl #4

-	ld2d	{z26.d, z27.d}, p1/z, [pCRow0]
+	ld2d	{z26.d, z27.d}, p1/z, [pCRow1]
 	fmla	z26.d, p1/m, z18.d, alphaz_R
 	fmls	z26.d, p1/m, z19.d, alphaz_I
 	fmla	z27.d, p1/m, z18.d, alphaz_I
 	fmla	z27.d, p1/m, z19.d, alphaz_R
-	st2d	{z26.d, z27.d}, p1, [pCRow0]
+	st2d	{z26.d, z27.d}, p1, [pCRow1]

-	add	pCRow0, pCRow0, #32
+	add	pCRow1, pCRow1, lanes, lsl #4

 	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

-	ld2d	{z28.d, z29.d}, p1/z, [pCRow1]
+	ld2d	{z28.d, z29.d}, p1/z, [pCRow2]
 	fmla	z28.d, p1/m, z20.d, alphaz_R
 	fmls	z28.d, p1/m, z21.d, alphaz_I
 	fmla	z29.d, p1/m, z20.d, alphaz_I
 	fmla	z29.d, p1/m, z21.d, alphaz_R
-	st2d	{z28.d, z29.d}, p1, [pCRow1]
+	st2d	{z28.d, z29.d}, p1, [pCRow2]

-	add	pCRow1, pCRow1, #32
+	add	pCRow2, pCRow2, lanes, lsl #4

-	ld2d	{z30.d, z31.d}, p1/z, [pCRow1]
+	ld2d	{z30.d, z31.d}, p1/z, [pCRow3]
 	fmla	z30.d, p1/m, z22.d, alphaz_R
 	fmls	z30.d, p1/m, z23.d, alphaz_I
 	fmla	z31.d, p1/m, z22.d, alphaz_I
 	fmla	z31.d, p1/m, z23.d, alphaz_R
-	st2d	{z30.d, z31.d}, p1, [pCRow1]
+	st2d	{z30.d, z31.d}, p1, [pCRow3]

-	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]

-	add	pCRow0, pCRow0, lanes, lsl #4	// pC = pC + lanes * 2 * 8
+	add	pCRow3, pCRow3, lanes, lsl #4	// pC = pC + lanes * 2 * 8

 	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]

@@ -474,24 +473,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmls	z24.d, p1/m, z17.d, alphaz_I
 	fmla	z25.d, p1/m, z16.d, alphaz_I
 	fmla	z25.d, p1/m, z17.d, alphaz_R
-	st2d	{z25.d, z26.d}, p1, [pCRow0]
+	st2d	{z24.d, z25.d}, p1, [pCRow0]

-	add	pCRow0, pCRow0, #32
+	add	pCRow0, pCRow0, lanes, lsl #4

-	ld2d	{z26.d, z27.d}, p1/z, [pCRow0]
+	ld2d	{z26.d, z27.d}, p1/z, [pCRow1]
 	fmla	z26.d, p1/m, z18.d, alphaz_R
 	fmls	z26.d, p1/m, z19.d, alphaz_I
 	fmla	z27.d, p1/m, z18.d, alphaz_I
 	fmla	z27.d, p1/m, z19.d, alphaz_R
-	st2d	{z26.d, z27.d}, p1, [pCRow0]
+	st2d	{z26.d, z27.d}, p1, [pCRow1]

-	add	pCRow0, pCRow0, #32
+	add	pCRow1, pCRow1, lanes, lsl #4

 	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

 	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

-	add	pCRow0, pCRow0, lanes, lsl #4	// pC = pC + lanes * 2 * 8
-
 .endm

 /******************************************************************************/

@@ -526,10 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmls	z24.d, p1/m, z17.d, alphaz_I
 	fmla	z25.d, p1/m, z16.d, alphaz_I
 	fmla	z25.d, p1/m, z17.d, alphaz_R
-	st2d	{z25.d, z26.d}, p1, [pCRow0]
-
-	add	pCRow0, pCRow0, #32
-
+	st2d	{z24.d, z25.d}, p1, [pCRow0]

 	add	pCRow0, pCRow0, lanes, lsl #4	// pC = pC + lanes * 2 * 8

@@ -718,6 +712,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ble	.Lzgemm_kernel_L1_BEGIN

 	mov	pCRow0, pC	// pCRow0 = pC
+	add	pCRow1, pCRow0, LDC

 	add	pC, pC, LDC, lsl #1

diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c
index be18e97..57035f4 100644
--- a/kernel/arm64/zgemm_ncopy_sve_v1.c
+++ b/kernel/arm64/zgemm_ncopy_sve_v1.c
@@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
          svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec);
          svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag));
          aoffset1 += 2;
-         boffset += active;
+         boffset += active * 2;
       }
       aoffset += sve_size * lda * 2;

diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c
index 085e1fa..32f217d 100644
--- a/kernel/arm64/zgemm_tcopy_sve_v1.c
+++ b/kernel/arm64/zgemm_tcopy_sve_v1.c
@@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
          aoffset1 += lda * 2;
          boffset += active * 2;
       }
-      aoffset += sve_size * 2;
+      aoffset += active * 2;

       j += svcntd();
       pg = svwhilelt_b64(j, n);
-- 
2.7.4
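
A minimal standalone sketch of the predicated packing loop that the two
one-line C fixes address (illustrative only, not the OpenBLAS source; the
function and its parameters are made up).  Each predicate lane of an
ld2/st2 pair covers one complex element, i.e. two doubles, so both
pointers have to advance by active * 2 doubles per iteration:

    #include <arm_sve.h>

    /* Pack n double-complex elements from src into dst. */
    void pack_complex(const double *src, double *dst, long n)
    {
        long i = 0;
        svbool_t pg = svwhilelt_b64(i, n);      /* one lane per complex element */
        while (svptest_any(svptrue_b64(), pg)) {
            long active = svcntp_b64(svptrue_b64(), pg); /* elements this pass */
            svfloat64x2_t v = svld2_f64(pg, src);   /* de-interleave re/im */
            svst2_f64(pg, dst, v);                  /* re-interleave on store */
            src += active * 2;   /* two doubles per complex element */
            dst += active * 2;   /* the fixed bump: active * 2, not active */
            i += svcntd();
            pg = svwhilelt_b64(i, n);
        }
    }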