From 1d33547222b5c633fae7c0e3f803735e9a20a665 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Sun, 27 Jul 2014 11:51:42 +0200 Subject: [PATCH] optimized zgemm kernel for haswell --- driver/level3/level3.c | 12 +- driver/level3/level3_thread.c | 12 +- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 2262 ++++++++++++++++++++++++++++-- 3 files changed, 2171 insertions(+), 115 deletions(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 2612040..70a6500 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,16 +333,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; - else - if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#else + if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#endif START_RPCC(); diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 95860d0..6162a9f 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,16 +367,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; -#if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) - if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; - else - if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#else + if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#endif START_RPCC(); diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index a71fff7..e23e09e 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /******************************************************************************** -* 2014/06/28 Saar +* 2014/07/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -40,12 +40,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * A_PR1 512 * B_PR1 512 * -* +* 2014/07/28 Saar * Performance at 4608x4608x4608: -* 1 thread: 43 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) -* 2 threads: 85 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) -* 3 threads: 122 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) -* 4 threads: 156 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) +* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) * ********************************************************************************/ @@ -191,6 +191,379 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
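The driver hunks above are easy to overlook next to the kernel itself: the Bulldozer/Piledriver/Haswell special case that allowed packing B in panels of up to 6*GEMM_UNROLL_N columns is dropped, and every architecture now clamps the packed-B panel width the same way. A minimal C restatement of the new clamp, assuming only that GEMM_UNROLL_N is the kernel's N-unroll (2 for this zgemm kernel, so the widest panel is 6 columns, which lines up with the kernel's two 3-column sweeps per 6-wide N block):

/* sketch of the panel-width clamp now shared by level3.c and
   level3_thread.c; GEMM_UNROLL_N = 2 assumed for zgemm on Haswell */
#define GEMM_UNROLL_N 2

static long clamp_min_jj(long min_jj)
{
    if (min_jj >= 3 * GEMM_UNROLL_N) return 3 * GEMM_UNROLL_N;
    if (min_jj >  GEMM_UNROLL_N)     return GEMM_UNROLL_N;
    return min_jj;   /* a tail narrower than one unroll is taken as-is */
}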
#define A_PR1 512 #define B_PR1 512 + + + +/***************************************************************************************************/ + +.macro KERNEL4x3_SUB + vmovups (AO), %ymm0 + vmovups 4 * SIZE(AO), %ymm1 + prefetcht0 A_PR1(AO) + + vbroadcastsd (BO), %ymm2 + vbroadcastsd 1 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) + + vbroadcastsd 2 * SIZE(BO), %ymm2 + vbroadcastsd 3 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) + + vbroadcastsd 4 * SIZE(BO), %ymm2 + vbroadcastsd 5 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) + + addq $6*SIZE, BO + addq $8*SIZE, AO + decq %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 + vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + vaddsubpd %ymm4 , %ymm5 ,%ymm5 + vaddsubpd %ymm6 , %ymm7 ,%ymm7 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + vmovapd %ymm5 , %ymm4 + vmovapd %ymm7 , %ymm6 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + vmulpd %ymm4 , %ymm0, %ymm4 + vmulpd %ymm6 , %ymm0, %ymm6 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + vmulpd %ymm5 , %ymm1, %ymm5 + vmulpd %ymm7 , %ymm1, %ymm7 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + + vaddpd (CO1, LDC,2), %ymm4 , %ymm4 + vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + vmovups %ymm4 , 
(CO1, LDC, 2) + vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + + + +/***************************************************************************************************/ + +.macro KERNEL2x3_SUB + vmovups (AO), %xmm0 + vmovups 2 * SIZE(AO), %xmm1 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) + + addq $6*SIZE, BO + addq $4*SIZE, AO + decq %rax +.endm + +.macro SAVE2x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bits + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + vaddsubpd %xmm4, %xmm5 ,%xmm5 + vaddsubpd %xmm6, %xmm7 ,%xmm7 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + vmovapd %xmm5, %xmm4 + vmovapd %xmm7, %xmm6 + + // swap high and low 64 bits + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + vmulpd %xmm4 , %xmm0, %xmm4 + vmulpd %xmm6 , %xmm0, %xmm6 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + vmulpd %xmm5 , %xmm1, %xmm5 + vmulpd %xmm7 , %xmm1, %xmm7 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + + vaddpd (CO1, LDC,2), %xmm4 , %xmm4 + vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + + vmovups %xmm4 , (CO1, LDC,2) + vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) + +.endm +
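The KERNEL*_SUB / SAVE* pairs above and below keep complex products in split form: VFMADDPD_R accumulates against the broadcast real part of B, VFMADDPD_I against the broadcast imaginary part, and only the SAVE macros combine the two accumulators, first swapping the 64-bit halves of the imaginary accumulator with vshufpd and then applying vaddsubpd, which subtracts in the low lane and adds in the high lane. A scalar C sketch of the NN case for a single C element (mul_acc is a hypothetical helper, and the plain-FMA expansion of VFMADDPD_R/_I is assumed from the macro names):

#include <complex.h>

/* what one accumulator pair holds after the K loop:
     acc_r = { sum(a_r*b_r), sum(a_i*b_r) }   -- VFMADDPD_R
     acc_i = { sum(a_r*b_i), sum(a_i*b_i) }   -- VFMADDPD_I
   vshufpd turns acc_i into { ii, ri }; vaddsubpd yields { rr-ii, ir+ri } */
static double complex mul_acc(const double complex *a,
                              const double complex *b, int k)
{
    double rr = 0, ir = 0, ri = 0, ii = 0;
    for (int i = 0; i < k; i++) {
        rr += creal(a[i]) * creal(b[i]);
        ir += cimag(a[i]) * creal(b[i]);
        ri += creal(a[i]) * cimag(b[i]);
        ii += cimag(a[i]) * cimag(b[i]);
    }
    return (rr - ii) + (ir + ri) * I;   /* vaddsubpd: sub low, add high */
}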
+/************************************************************************************************/ + + +.macro KERNEL1x3_SUB + vmovups (AO), %xmm0 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + + addq $6*SIZE, BO + addq $2*SIZE, AO + decq %rax +.endm + +.macro SAVE1x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bits + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + vaddsubpd %xmm4, %xmm5, %xmm5 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm5, %xmm4 + + // swap high and low 64 bits + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm5 , %xmm1, %xmm5 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + +#ifndef TRMMKERNEL + + vaddpd (CO1) , %xmm8 , %xmm8 + vaddpd (CO1, LDC) , %xmm10, %xmm10 + vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm4 , (CO1, LDC,2) + +.endm + + + + /***************************************************************************************************/ + +.macro KERNEL4x2_SUB
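Every compute loop in the body below has the same shape: the K count is first rounded down to a multiple of eight (andq $ -8), run as straight-line blocks of eight KERNEL*_SUB expansions with a je re-test between blocks (each macro decrements the counter itself), and the K % 8 leftovers go through the single-expansion *_17/*_47 tails. A C sketch of that control flow, with kernel_sub() as a hypothetical stand-in for one macro expansion:

/* hypothetical stand-in for one KERNEL*_SUB: do the FMAs, decrement K */
static void kernel_sub(long *rax) { /* ... */ (*rax)--; }

static void k_loop(long k)
{
    long rax = k & ~7L;             /* andq $ -8, %rax */
    while (rax > 0) {               /* .L*_12: blocks of eight, je between */
        for (int u = 0; u < 8; u++)
            kernel_sub(&rax);
    }
    rax = k & 7L;                   /* andq $ 7, %rax */
    while (rax > 0)                 /* .L*_17: one expansion per pass */
        kernel_sub(&rax);
}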
@@ -532,150 +905,1841 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#ifndef TRMMKERNEL +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 ,4 * SIZE(CO1) + +.endm + + + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bits + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bits + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bits + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bits + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + + +/************************************************************************************************/ + + + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14,
192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +/************************************************************************************************/ +.L6_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L2_00_0 + ALIGN_4 + + + +.L6_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq BO2, B // next offset of B + movq K, %rax + ALIGN_4 + +.L6_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups (BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_00_02b + +.L6_00_02c: + + + +.L6_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L6_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L6_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_16 + ALIGN_4 + +.L6_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + jmp .L6_4_12 + ALIGN_4 + +.L6_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_19 + ALIGN_4 + +.L6_4_17: + + KERNEL4x3_SUB + + jnz .L6_4_17 + ALIGN_4 + + +.L6_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L6_2_10: + testq $ 2, M + jz .L6_2_40 // to next 2 lines of N + +.L6_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_16 + ALIGN_4 + +.L6_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + jmp .L6_2_12 + ALIGN_4 + +.L6_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_19 + ALIGN_4 + +.L6_2_17: + + 
KERNEL2x3_SUB + + jnz .L6_2_17 + ALIGN_4 + + +.L6_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_2_40: + testq $ 1, M + jz .L6_2_60 // to next 2 lines of N + + ALIGN_4 + +.L6_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_46 + + ALIGN_4 + +.L6_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + jmp .L6_2_42 + ALIGN_4 + +.L6_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_49 + + ALIGN_4 + +.L6_2_47: + + KERNEL1x3_SUB + + jnz .L6_2_47 + ALIGN_4 + + +.L6_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L6_2_41 + ALIGN_4 + + + + +.L6_2_60: + + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.L7_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq K, %rax + ALIGN_4 + +.L7_00_02b: + + vmovups 2 * SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_00_02b + +.L7_00_02c: + + movq BO2, B // next offset of B + + +.L7_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L7_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L7_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_16 + ALIGN_4 + +.L7_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + jmp .L7_4_12 + ALIGN_4 + +.L7_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_19 + + ALIGN_4 + +.L7_4_17: + + KERNEL4x3_SUB + + jnz .L7_4_17 + ALIGN_4 + + +.L7_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L7_2_10: + testq $ 2, M + jz .L7_2_40 // to next 2 lines of N + +.L7_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_16 + ALIGN_4 + +.L7_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + 
KERNEL2x3_SUB + + je .L7_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + jmp .L7_2_12 + ALIGN_4 + +.L7_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_19 + + ALIGN_4 + +.L7_2_17: + + KERNEL2x3_SUB + + jnz .L7_2_17 + ALIGN_4 + + +.L7_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_2_40: + testq $ 1, M + jz .L7_2_60 // to next 2 lines of N + + ALIGN_4 + +.L7_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_46 + + ALIGN_4 + +.L7_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + jmp .L7_2_42 + ALIGN_4 + +.L7_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_49 + ALIGN_4 + +.L7_2_47: + + KERNEL1x3_SUB + + jnz .L7_2_47 + ALIGN_4 + + +.L7_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L7_2_41 + ALIGN_4 + + + + +.L7_2_60: + + decq J // j -- + jg .L6_00_01 // next 6 lines of N + +/************************************************************************************************/ + + + +/************************************************************************************************/ +.L2_00_0: + + movq Nmod6, J + sarq $1, J // j = j / 2 + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 
+ + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = 
BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + 
prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( 
,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // 
Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + SAVE2x1 +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO #endif - vmovups %ymm8 , (CO1) - vmovups %ymm12 ,4 * SIZE(CO1) - -.endm - +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif -/************************************************************************************************/ + addq $ 4 * SIZE, CO1 # coffset += 4 -.macro KERNEL2x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 4, %rax -.endm + ALIGN_4 -.macro SAVE2x1 - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + ALIGN_4 -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) +.L1_2_41: - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vzeroall +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK #else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values -#endif + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 +.L1_2_42: - // multiply with ALPHA_I - vmulpd %xmm9 , 
%xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB -#ifndef TRMMKERNEL + je .L1_2_46 - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB -#endif + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) + je .L1_2_46 -.endm + jmp .L1_2_42 + ALIGN_4 +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif -/************************************************************************************************/ + andq $ 7, %rax # if (k & 1) + je .L1_2_49 -.macro KERNEL1x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values -.macro SAVE1x1 + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 +.L1_2_47: - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + KERNEL1x1_SUB -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) + jl .L1_2_47 + ALIGN_4 - vaddsubpd %xmm9, %xmm8, %xmm8 - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 +.L1_2_49: -#else - vaddsubpd %xmm8, %xmm9, %xmm9 + SAVE1x1 - vmovapd %xmm9, %xmm8 +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK #endif - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vaddsubpd %xmm9 ,%xmm8, %xmm8 -#ifndef TRMMKERNEL - vaddpd (CO1), %xmm8 , %xmm8 -#endif - vmovups %xmm8 , (CO1) +.L999: + vzeroupper -.endm + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif -/************************************************************************************************/ + addq $ STACKSIZE, %rsp + ret + EPILOGUE +#else 
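Everything above is the plain GEMM build; the #else branch that follows carries the TRMM build of the same micro-kernels. Its per-tile trimming of K is already visible in the guarded blocks earlier in the file: KKK becomes K - KK when ((LEFT && !TRANSA) || (!LEFT && TRANSA)), otherwise KK plus the tile's unroll (4, 2 or 1 on the M side, 2 or 1 on the N side). A C restatement of that computation, with LEFT/TRANSA turned into flags and the unrolls generalized to mb/nb:

/* sketch of the KKK (effective K per tile) logic from the #if blocks */
static long effective_k(long k, long kk, int left, int transa,
                        long mb, long nb)
{
    if ((left && !transa) || (!left && transa))
        return k - kk;              /* movq K,%rax ; subq KK,%rax */
    return kk + (left ? mb : nb);   /* movq KK,%rax ; addq $mb|nb,%rax */
}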
+/************************************************************************************************ + TRMM Kernel +************************************************************************************************/ PROLOGUE PROFCODE @@ -1811,3 +3875,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ret EPILOGUE + +#endif + + -- 2.7.4