From 46bc4fd50cb0581daa31fce012708680ead4818b Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 29 Jul 2014 08:53:09 +0200 Subject: [PATCH] optimized cgemm kernel for haswell --- kernel/x86_64/cgemm_kernel_8x2_haswell.S | 7212 ++++++++++++++++++++---------- 1 file changed, 4927 insertions(+), 2285 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S index baee3cd..98f4005 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S @@ -1,2285 +1,4927 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/********************************************************************* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* CGEMM_DEFAULT_UNROLL_N 2 -* CGEMM_DEFAULT_UNROLL_M 8 -* CGEMM_DEFAULT_P 384 -* CGEMM_DEFAULT_Q 192 -* A_PR1 512 -* B_PR1 512 -* -* Performance at 6912x6912x6912: -* 1 thread: 84 GFLOPS (SANDYBRIDGE: 60) (MKL: 86) -* 2 threads: 153 GFLOPS (SANDYBRIDGE: 114) (MKL: 155) -* 3 threads: 224 GFLOPS (SANDYBRIDGE: 162) (MKL: 222) -* 4 threads: 278 GFLOPS (SANDYBRIDGE: 223) (MKL: 279) -* -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(BULLDOZER) - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 - -#else - -#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 - -#endif - -#else - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 - -#else - -#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 - -#define VFMADDPS_I( 
y0,y1,y2 ) vfnmadd231ps y1,y2,y0 - -#endif - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/***************************************************************************************************************************/ - -.macro KERNEL8x2_SUB - - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) - VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_R( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_R( %ymm14,%ymm6,%ymm1 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_I( %ymm11,%ymm7,%ymm0 ) - VFMADDPS_I( %ymm15,%ymm7,%ymm1 ) - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm10, %ymm11,%ymm11 - vaddsubps %ymm12, %ymm13,%ymm13 - vaddsubps %ymm14, %ymm15,%ymm15 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm11, %ymm10 - vmovaps %ymm13, %ymm12 - vmovaps %ymm15, %ymm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm10, %ymm0, %ymm10 - vmulps %ymm12, %ymm0, %ymm12 - vmulps %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm11, %ymm1, %ymm11 - vmulps %ymm13, %ymm1, %ymm13 - vmulps %ymm15, %ymm1, %ymm15 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - - vaddps (CO1, LDC), %ymm10, %ymm10 - vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 8 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 8, %rax -.endm - 
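/* Note on the complex arithmetic shared by every KERNELmxn_SUB / SAVEmxn pair
   in this file: for each column of C the kernel keeps two accumulators, one
   collecting A times the broadcast real part of B (VFMADDPS_R) and one
   collecting A times the broadcast imaginary part (VFMADDPS_I). The SAVE
   macros swap the two 4-byte halves of each complex element in the imaginary
   accumulator with vshufps $0xb1 and merge the pair with vaddsubps, which
   subtracts in the even (real) lanes and adds in the odd (imaginary) lanes;
   the vfmadd/vfnmadd selection above supplies the sign flips for the
   conjugated (R/C) cases. Below is a minimal scalar model of the
   non-conjugated (NN) path in C; the function name and the unit-stride
   packed layout are illustrative assumptions, not part of this kernel:

	#include <complex.h>

	// Scalar model of one element of C. rr/ir play the role of the "real"
	// accumulator (ymm8, ymm10, ...), ri/ii the "imaginary" one (ymm9, ...).
	static float complex cgemm_dot_nn(int k, const float complex *a,
	                                  const float complex *b,
	                                  float complex alpha)
	{
	    float rr = 0.0f, ir = 0.0f;   // VFMADDPS_R: A * Re(b), even/odd lane
	    float ri = 0.0f, ii = 0.0f;   // VFMADDPS_I: A * Im(b), even/odd lane

	    for (int l = 0; l < k; l++) {
	        rr += crealf(a[l]) * crealf(b[l]);
	        ir += cimagf(a[l]) * crealf(b[l]);
	        ri += crealf(a[l]) * cimagf(b[l]);
	        ii += cimagf(a[l]) * cimagf(b[l]);
	    }

	    // SAVE: vshufps $0xb1 turns (ri,ii) into (ii,ri); vaddsubps then
	    // yields re = rr - ii and im = ir + ri, i.e. the complex product.
	    // alpha is applied with the same shuffle/addsub sequence.
	    return alpha * ((rr - ii) + (ir + ri) * I);
	}

   Keeping the two partial products separate until the save phase leaves the
   inner loop with nothing but loads, broadcasts and FMAs. */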
-.macro SAVE4x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - 
vmovups %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - vmovsd %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) - VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) - VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm12, %ymm13,%ymm13 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm13, %ymm12 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm13, %ymm1, %ymm13 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - 
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 
, %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L2_4_10 - - ALIGN_4 -/**********************************************************************************************************/ - -.L2_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, 
%rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - - je .L2_8_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x2_SUB - - je .L2_8_16 - - jmp .L2_8_12 - ALIGN_4 - -.L2_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_17: - - KERNEL8x2_SUB - - jl .L2_8_17 - ALIGN_4 - - -.L2_8_19: - - SAVE8x2 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_8_11 - ALIGN_4 - - -/**********************************************************************************************************/ - - - - -.L2_4_10: - testq $ 7, M - jz .L2_4_60 // to next 2 lines of N - - testq $ 4, M - jz .L2_4_20 - ALIGN_4 - - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - 
KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_4_20: - - testq $ 2, M - jz .L2_4_40 - ALIGN_4 - -.L2_4_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - jmp .L2_4_22 - ALIGN_4 - -.L2_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - 
andq $ 7, %rax # if (k & 1) - je .L2_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_27: - - KERNEL2x2_SUB - - jl .L2_4_27 - ALIGN_4 - - -.L2_4_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_4_21 - ALIGN_4 - - - -/**************************************************************************/ -.L2_4_40: - testq $ 1, M - jz .L2_4_60 // to next 2 lines of N - - ALIGN_4 - -.L2_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - 
prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - jmp .L2_4_42 - ALIGN_4 - -.L2_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_47: - - KERNEL1x2_SUB - - jl .L2_4_47 - ALIGN_4 - - -.L2_4_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_4_41 - ALIGN_4 - - - - -.L2_4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L1_4_10 - - ALIGN_4 - -/**************************************************************************************************/ - -.L1_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - 
KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - jmp .L1_8_12 - ALIGN_4 - -.L1_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_17: - - KERNEL8x1_SUB - - jl .L1_8_17 - ALIGN_4 - - -.L1_8_19: - - SAVE8x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_8_11 - ALIGN_4 - - - -/**************************************************************************************************/ -.L1_4_10: - - testq $ 7, M - jz .L999 - - testq $ 4, M - jz .L1_4_20 - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - 
prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_4_20: - - testq $ 2, M - jz .L1_4_40 - ALIGN_4 - -.L1_4_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - jmp .L1_4_22 - ALIGN_4 - -.L1_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_27: - - KERNEL2x1_SUB - - jl .L1_4_27 - ALIGN_4 - - -.L1_4_29: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_4_40: - testq $ 1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - jmp .L1_4_42 - ALIGN_4 - -.L1_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_47: - - KERNEL1x1_SUB - - jl .L1_4_47 - ALIGN_4 - - -.L1_4_49: - - SAVE1x1 - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), 
%xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/********************************************************************* +* 2014/07/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* CGEMM_DEFAULT_UNROLL_N 2 +* CGEMM_DEFAULT_UNROLL_M 8 +* CGEMM_DEFAULT_P 384 +* CGEMM_DEFAULT_Q 192 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/29 Saar +* Performance at 6912x6912x6912: +* 1 thread: 107 GFLOPS (SANDYBRIDGE: 60) (MKL: 86) +* 2 threads: 208 GFLOPS (SANDYBRIDGE: 114) (MKL: 155) +* 3 threads: 289 GFLOPS (SANDYBRIDGE: 162) (MKL: 222) +* 4 threads: 377 GFLOPS (SANDYBRIDGE: 223) (MKL: 279) +* +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 
0, 4096 * 2(%rsp);\
+ movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 12288
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 3(%rsp);\
+ movl $ 0, 4096 * 2(%rsp);\
+ movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 8192
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 2(%rsp);\
+ movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 4096
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 1(%rsp);
+#else
+#define STACK_TOUCH
+#endif
+#else
+#define STACK_TOUCH
+#endif
+
+
+#if defined(BULLDOZER)
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
+
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+
+#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
+
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
+
+#else
+
+#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
+
+#endif
+
+#else
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
+
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+
+#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
+
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
+
+#else
+
+#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
+
+#endif
+
+#endif
+
+
+#define A_PR1 512
+#define B_PR1 512
+
+
+
+/***************************************************************************************************************************/
+
+.macro KERNEL8x3_SUB
+
+ vmovups -16 * SIZE(AO), %ymm0
+ vmovups -8 * SIZE(AO), %ymm1
+ vbroadcastss -8 * SIZE(BO), %ymm2
+ vbroadcastss -7 * SIZE(BO), %ymm3
+ prefetcht0 A_PR1(AO)
+
+ VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 )
+ VFMADDPS_R( %ymm12,%ymm2,%ymm1 )
+ VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 )
+ VFMADDPS_I( %ymm13,%ymm3,%ymm1 )
+
+ vbroadcastss -6 * SIZE(BO), %ymm2
+ vbroadcastss -5 * SIZE(BO), %ymm3
+ VFMADDPS_R( %ymm10,%ymm2,%ymm0 )
+ VFMADDPS_R( %ymm14,%ymm2,%ymm1 )
+ VFMADDPS_I( %ymm11,%ymm3,%ymm0 )
+ VFMADDPS_I( %ymm15,%ymm3,%ymm1 )
+
+ vbroadcastss -4 * SIZE(BO), %ymm2
+ vbroadcastss -3 * SIZE(BO), %ymm3
+ VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 )
+ VFMADDPS_R( %ymm6 ,%ymm2,%ymm1 )
+ VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 )
+ VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 )
+
+
+ addq $6*SIZE, BO
+ addq $16*SIZE, AO
+ decq %rax
+.endm
+
+.macro SAVE8x3
+
+ vbroadcastss ALPHA_R, %ymm0
+ vbroadcastss ALPHA_I, %ymm1
+
+ // swap high and low 4 bytes
+ vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
+ vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
+ vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
+ vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
+ vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
+ vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
+ defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+ vaddsubps %ymm9, %ymm8 , %ymm8
+ vaddsubps %ymm11,%ymm10, %ymm10
+ vaddsubps %ymm13,%ymm12, %ymm12
+ vaddsubps %ymm15,%ymm14, %ymm14
+ vaddsubps %ymm5, %ymm4 , %ymm4
+ vaddsubps %ymm7, %ymm6 , %ymm6
+
+ vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9
+ vshufps $ 0xb1, %ymm10, %ymm10, %ymm11
+ vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
+ vshufps $ 0xb1, %ymm14, %ymm14, %ymm15
+ vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5
+ vshufps $ 0xb1, %ymm6 , %ymm6 , %ymm7
+
+#else
+ vaddsubps %ymm8, %ymm9 ,%ymm9
+ vaddsubps %ymm10, %ymm11,%ymm11
+ vaddsubps %ymm12, %ymm13,%ymm13
+ vaddsubps %ymm14, %ymm15,%ymm15
+ vaddsubps %ymm4, %ymm5 ,%ymm5
+ vaddsubps %ymm6, %ymm7 ,%ymm7
+
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm11, %ymm10
+ vmovaps %ymm13, %ymm12
+ vmovaps %ymm15, %ymm14
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm7, %ymm6
+
+ // swap high and low 4 bytes
+ vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
+ vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
+ vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
+ vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
+ vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
+ vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7
+
+#endif
+
+ // multiply with ALPHA_R
+ vmulps %ymm8 , %ymm0, %ymm8
+ vmulps %ymm10, %ymm0, %ymm10
+ vmulps %ymm12, %ymm0, %ymm12
+ vmulps %ymm14, %ymm0, %ymm14
+ vmulps %ymm4 , %ymm0, %ymm4
+ vmulps %ymm6 , %ymm0, %ymm6
+
+ // multiply with ALPHA_I
+ vmulps %ymm9 , %ymm1, %ymm9
+ vmulps %ymm11, %ymm1, %ymm11
+ vmulps %ymm13, %ymm1, %ymm13
+ vmulps %ymm15, %ymm1, %ymm15
+ vmulps %ymm5 , %ymm1, %ymm5
+ vmulps %ymm7 , %ymm1, %ymm7
+
+ vaddsubps %ymm9, %ymm8 , %ymm8
+ vaddsubps %ymm11,%ymm10, %ymm10
+ vaddsubps %ymm13,%ymm12, %ymm12
+ vaddsubps %ymm15,%ymm14, %ymm14
+ vaddsubps %ymm5, %ymm4 , %ymm4
+ vaddsubps %ymm7, %ymm6 , %ymm6
+
+#if !defined(TRMMKERNEL)
+
+ vaddps (CO1), %ymm8 , %ymm8
+ vaddps 8 * SIZE(CO1), %ymm12, %ymm12
+
+ vaddps (CO1, LDC), %ymm10, %ymm10
+ vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14
+
+ vaddps (CO1, LDC,2), %ymm4, %ymm4
+ vaddps 8 * SIZE(CO1, LDC,2), %ymm6, %ymm6
+
+#endif
+
+ vmovups %ymm8 , (CO1)
+ vmovups %ymm12 , 8 * SIZE(CO1)
+
+ vmovups %ymm10 , (CO1, LDC)
+ vmovups %ymm14 , 8 * SIZE(CO1, LDC)
+
+ vmovups %ymm4 , (CO1, LDC,2)
+ vmovups %ymm6 , 8 * SIZE(CO1, LDC,2)
+
+.endm
+
+
+/***************************************************************************************************************************/
+
+.macro KERNEL4x3_SUB
+
+ vmovups -16 * SIZE(AO), %ymm0
+ vbroadcastss -8 * SIZE(BO), %ymm2
+ vbroadcastss -7 * SIZE(BO), %ymm3
+
+ VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 )
+ VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 )
+
+ vbroadcastss -6 * SIZE(BO), %ymm2
+ vbroadcastss -5 * SIZE(BO), %ymm3
+ VFMADDPS_R( %ymm12,%ymm2,%ymm0 )
+ VFMADDPS_I( %ymm13,%ymm3,%ymm0 )
+
+ vbroadcastss -4 * SIZE(BO), %ymm2
+ vbroadcastss -3 * SIZE(BO), %ymm3
+ VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 )
+ VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 )
+
+ addq $6*SIZE, BO
+ addq $8*SIZE, AO
+ decq %rax
+.endm
+
+.macro SAVE4x3
+
+ vbroadcastss ALPHA_R, %ymm0
+ vbroadcastss ALPHA_I, %ymm1
+
+ // swap high and low 4 bytes
+ vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
+ vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
+ vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
+ defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+ vaddsubps %ymm9, %ymm8 , %ymm8
+ vaddsubps %ymm13,%ymm12, %ymm12
+ vaddsubps %ymm5, %ymm4 , %ymm4
+
+ vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9
+ vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
+ vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5
+
+#else
+ vaddsubps %ymm8, %ymm9 ,%ymm9
+ vaddsubps %ymm12, %ymm13,%ymm13
+ vaddsubps %ymm4, %ymm5 ,%ymm5
+
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm13, %ymm12
+ vmovaps %ymm5, %ymm4
+
+ // swap high and low 4 bytes
+ vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
+ vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
+ vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
+
+#endif
+
+ // multiply with ALPHA_R
+ vmulps %ymm8 , %ymm0, %ymm8
+ vmulps %ymm12, %ymm0, %ymm12
+ vmulps %ymm4 , %ymm0, %ymm4
+
+ // multiply with ALPHA_I
+ vmulps %ymm9 , %ymm1, %ymm9
+ vmulps %ymm13, %ymm1, %ymm13
+ vmulps %ymm5 , %ymm1, %ymm5
+
+ vaddsubps %ymm9, %ymm8 , %ymm8
+ vaddsubps %ymm13,%ymm12, %ymm12
+ vaddsubps %ymm5, %ymm4 , %ymm4
+
+#if !defined(TRMMKERNEL)
+
+ vaddps (CO1), %ymm8 , %ymm8
+ vaddps (CO1, LDC), %ymm12, %ymm12
+ vaddps (CO1, LDC,2), %ymm4, %ymm4
+
+#endif
+
+ vmovups %ymm8 , (CO1)
+ vmovups %ymm12 , (CO1, LDC)
+ vmovups %ymm4 , (CO1, LDC,2)
+
+.endm
+
+/***************************************************************************************************************************/
+
+.macro KERNEL2x3_SUB
+
+ vmovups -16 * SIZE(AO), %xmm0
+ vbroadcastss -8 * SIZE(BO), %xmm2
+ vbroadcastss -7 * SIZE(BO), %xmm3
+
+ VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 )
+ VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 )
+
+ vbroadcastss -6 * SIZE(BO), %xmm2
+ vbroadcastss -5 * SIZE(BO), %xmm3
+ VFMADDPS_R( %xmm12,%xmm2,%xmm0 )
+ VFMADDPS_I( %xmm13,%xmm3,%xmm0 )
+
+ vbroadcastss -4 * SIZE(BO), %xmm2
+ vbroadcastss -3 * SIZE(BO), %xmm3
+ VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
+ VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
+
+ addq $6*SIZE, BO
+ addq $4*SIZE, AO
+ decq %rax
+
+.endm
+
+.macro SAVE2x3
+
+ vbroadcastss ALPHA_R, %xmm0
+ vbroadcastss ALPHA_I, %xmm1
+
+ // swap high and low 4 bytes
+ vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
+ vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
+ vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
+ defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+ vaddsubps %xmm9, %xmm8 , %xmm8
+ vaddsubps %xmm13,%xmm12, %xmm12
+ vaddsubps %xmm5, %xmm4 , %xmm4
+
+ vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9
+ vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
+ vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5
+
+#else
+ vaddsubps %xmm8, %xmm9 ,%xmm9
+ vaddsubps %xmm12, %xmm13,%xmm13
+ vaddsubps %xmm4, %xmm5 ,%xmm5
+
+ vmovaps %xmm9, %xmm8
+ vmovaps %xmm13, %xmm12
+ vmovaps %xmm5, %xmm4
+
+ // swap high and low 4 bytes
+ vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
+ vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
+ vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
+
+#endif
+
+ // multiply with ALPHA_R
+ vmulps %xmm8 , %xmm0, %xmm8
+ vmulps %xmm12, %xmm0, %xmm12
+ vmulps %xmm4 , %xmm0, %xmm4
+
+ // multiply with ALPHA_I
+ vmulps %xmm9 , %xmm1, %xmm9
+ vmulps %xmm13, %xmm1, %xmm13
+ vmulps %xmm5 , %xmm1, %xmm5
+
+ vaddsubps %xmm9, %xmm8 , %xmm8
+ vaddsubps %xmm13,%xmm12, %xmm12
+ vaddsubps %xmm5, %xmm4 , %xmm4
+
+#if !defined(TRMMKERNEL)
+
+ vaddps (CO1), %xmm8 , %xmm8
+ vaddps (CO1, LDC), %xmm12, %xmm12
+ vaddps (CO1, LDC,2), %xmm4, %xmm4
+
+#endif
+
+ vmovups %xmm8 , (CO1)
+ vmovups %xmm12 , (CO1, LDC)
+ vmovups %xmm4 , (CO1, LDC,2)
+
+.endm
+
+
+/***************************************************************************************************************************/
+
+.macro KERNEL1x3_SUB
+
+ vmovsd -16 * SIZE(AO), %xmm0
+ vbroadcastss -8 * SIZE(BO), %xmm2
+ vbroadcastss -7 * SIZE(BO), %xmm3
+
+ VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 )
+ VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 )
+
+ vbroadcastss -6 * SIZE(BO), %xmm2
+ vbroadcastss -5 * SIZE(BO), %xmm3
+ VFMADDPS_R( %xmm12,%xmm2,%xmm0 )
+ VFMADDPS_I( %xmm13,%xmm3,%xmm0 )
+
+ vbroadcastss -4 * SIZE(BO), %xmm2
+ vbroadcastss -3 * SIZE(BO), %xmm3
+ VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
+ VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
+
+ addq $6*SIZE, BO
+ addq $2*SIZE, AO
+ decq %rax
+
+.endm
+
+.macro SAVE1x3
+
+ vbroadcastss ALPHA_R, %xmm0
+ vbroadcastss ALPHA_I, %xmm1
+
+ // swap high and low 4 bytes
bytes + vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm5, %xmm4 , %xmm4 + + vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm4, %xmm5 ,%xmm5 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + vmovaps %xmm5, %xmm4 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm5 , %xmm1, %xmm5 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm5, %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vmovsd (CO1) , %xmm9 + vmovsd (CO1,LDC) , %xmm13 + vmovsd (CO1,LDC,2), %xmm5 + vaddps %xmm9 , %xmm8 , %xmm8 + vaddps %xmm13, %xmm12, %xmm12 + vaddps %xmm5 , %xmm4, %xmm4 + +#endif + + vmovsd %xmm8 , (CO1) + vmovsd %xmm12 , (CO1, LDC) + vmovsd %xmm4 , (CO1, LDC,2) + +.endm + + +/***************************************************************************************************************************/ + +.macro KERNEL8x2_SUB + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_R( %ymm14,%ymm6,%ymm1 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPS_I( %ymm15,%ymm7,%ymm1 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm10, %ymm11,%ymm11 + vaddsubps %ymm12, %ymm13,%ymm13 + vaddsubps %ymm14, %ymm15,%ymm15 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm11, %ymm10 + vmovaps %ymm13, %ymm12 + vmovaps %ymm15, %ymm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm10, %ymm0, %ymm10 + vmulps %ymm12, %ymm0, %ymm12 + vmulps %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I 
+ vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm11, %ymm1, %ymm11 + vmulps %ymm13, %ymm1, %ymm13 + vmulps %ymm15, %ymm1, %ymm15 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + + vaddps (CO1, LDC), %ymm10, %ymm10 + vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 8 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( 
%xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + vmovsd %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) + VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_I( 
%ymm9,%ymm5,%ymm0 ) + VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm12, %ymm13,%ymm13 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm13, %ymm12 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm13, %ymm1, %ymm13 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , 
%xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + +.endm + + +#if !defined(TRMMKERNEL) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + +/************************************************************************************************/ + +.L6_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02b + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 
* ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L6_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L6_8_11: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_8_16 + + ALIGN_4 + +.L6_8_12: + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + je .L6_8_16 + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + je .L6_8_16 + + jmp .L6_8_12 + ALIGN_4 + +.L6_8_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_8_19 + + ALIGN_4 + +.L6_8_17: + + KERNEL8x3_SUB + + jnz .L6_8_17 + ALIGN_4 + + +.L6_8_19: + + SAVE8x3 + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + +.L6_4_10: + testq $ 7, M + jz .L6_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L6_4_20 + ALIGN_4 + + +.L6_4_11: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_16 + + ALIGN_4 + +.L6_4_12: + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + jmp .L6_4_12 + ALIGN_4 + +.L6_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_19 + + ALIGN_4 + +.L6_4_17: + + KERNEL4x3_SUB + + jnz .L6_4_17 + ALIGN_4 + + +.L6_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L6_4_20: + + testq $ 2, M + jz .L6_4_40 + ALIGN_4 + +.L6_4_21: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_26 + + ALIGN_4 + +.L6_4_22: + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_4_26 + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_4_26 + + jmp .L6_4_22 + ALIGN_4 + +.L6_4_26: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_29 + + ALIGN_4 + +.L6_4_27: + + KERNEL2x3_SUB + + jnz .L6_4_27 + ALIGN_4 + + +.L6_4_29: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L6_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L6_4_40: + testq $ 1, M + jz .L6_4_60 // to next 2 lines of N + + ALIGN_4 + +.L6_4_41: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_46 + + 
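// Note on the Nx3 micro-kernels used above: every KERNEL*x3_SUB ends with + // "decq %rax", so the unrolled 8x blocks fall through their "je" exits the + // moment the counter reaches zero, and the .L*_*7 tail loops run the + // remaining k % 8 iterations. The SAVE*x3 macros then form the complex + // products with vshufps $ 0xb1 (which reorders each 128-bit lane as + // [1,0,3,2], i.e. swaps the 4-byte real/imaginary halves of every complex + // element) followed by vaddsubps. Scalar sketch of one element for the + // NN-type variants (names purely illustrative; the other transpose cases + // flip signs via VFMADDPS_R/I and the #else path of the SAVE macros): + // t_r = a_r*b_r - a_i*b_i ; t_i = a_r*b_i + a_i*b_r + // c_r = alpha_r*t_r - alpha_i*t_i ; c_i = alpha_r*t_i + alpha_i*t_r + 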
ALIGN_4 + +.L6_4_42: + + prefetcht0 A_PR1(AO) + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_4_46 + + prefetcht0 A_PR1(AO) + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_4_46 + + jmp .L6_4_42 + ALIGN_4 + +.L6_4_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_49 + ALIGN_4 + +.L6_4_47: + + KERNEL1x3_SUB + + jnz .L6_4_47 + ALIGN_4 + + +.L6_4_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L6_4_41 + ALIGN_4 + + + + +.L6_4_60: + + +/*******************************************************************************************/ + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + +.L7_02b: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02b + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L7_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L7_8_11: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_8_16 + + ALIGN_4 + +.L7_8_12: + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + je .L7_8_16 + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + KERNEL8x3_SUB + + je .L7_8_16 + + jmp .L7_8_12 + ALIGN_4 + +.L7_8_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_8_19 + + ALIGN_4 + +.L7_8_17: + + KERNEL8x3_SUB + + jnz .L7_8_17 + ALIGN_4 + + +.L7_8_19: + + SAVE8x3 + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + +.L7_4_10: + testq $ 7, M + jz .L7_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L7_4_20 + ALIGN_4 + + +.L7_4_11: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_16 + + ALIGN_4 + +.L7_4_12: + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + prefetcht0 A_PR1(AO) + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + jmp .L7_4_12 + ALIGN_4 + +.L7_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_19 + + ALIGN_4 + +.L7_4_17: + + KERNEL4x3_SUB + + jnz .L7_4_17 + ALIGN_4 + + +.L7_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M 
+***************************************************************************/ + +.L7_4_20: + + testq $ 2, M + jz .L7_4_40 + ALIGN_4 + +.L7_4_21: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_26 + + ALIGN_4 + +.L7_4_22: + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_4_26 + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + prefetcht0 A_PR1(AO) + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_4_26 + + jmp .L7_4_22 + ALIGN_4 + +.L7_4_26: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_29 + + ALIGN_4 + +.L7_4_27: + + KERNEL2x3_SUB + + jnz .L7_4_27 + ALIGN_4 + + +.L7_4_29: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L7_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L7_4_40: + testq $ 1, M + jz .L7_4_60 // to next 2 lines of N + + ALIGN_4 + +.L7_4_41: + + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_46 + + ALIGN_4 + +.L7_4_42: + + prefetcht0 A_PR1(AO) + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_4_46 + + prefetcht0 A_PR1(AO) + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_4_46 + + jmp .L7_4_42 + ALIGN_4 + +.L7_4_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_49 + ALIGN_4 + +.L7_4_47: + + KERNEL1x3_SUB + + jnz .L7_4_47 + ALIGN_4 + + +.L7_4_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L7_4_41 + ALIGN_4 + + + + +.L7_4_60: + + decq J // j -- + jg .L6_01 // next 6 lines of N + + + +/************************************************************************************************/ + +.L2_00: + + movq Nmod6, J + sarq $1, J // j = j / 2 + cmpq $ 0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L2_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L2_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq 
K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + je .L2_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x2_SUB + + je .L2_8_16 + + jmp .L2_8_12 + ALIGN_4 + +.L2_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_17: + + KERNEL8x2_SUB + + jl .L2_8_17 + ALIGN_4 + + +.L2_8_19: + + SAVE8x2 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + + + +.L2_4_10: + testq $ 7, M + jz .L2_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L2_4_20 + ALIGN_4 + + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = 
K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_4_20: + + testq $ 2, M + jz .L2_4_40 + ALIGN_4 + +.L2_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + 
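// BI and %rax run from negative offsets up toward zero (AO/BO were advanced + // past the panel ends and the indices negated above), so the trailing + // "addq $ 4, %rax" of each KERNEL2x2_SUB sets the flags tested by the "je" + // below and by the "jl" of the .L2_4_27 tail loop; no separate loop + // counter is needed. + 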
je .L2_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + jmp .L2_4_22 + ALIGN_4 + +.L2_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_27: + + KERNEL2x2_SUB + + jl .L2_4_27 + ALIGN_4 + + +.L2_4_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L2_4_40: + testq $ 1, M + jz .L2_4_60 // to next 2 lines of N + + ALIGN_4 + +.L2_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 
1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + jmp .L2_4_42 + ALIGN_4 + +.L2_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_47: + + KERNEL1x2_SUB + + jl .L2_4_47 + ALIGN_4 + + +.L2_4_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_4_41 + ALIGN_4 + + + + +.L2_4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L1_4_10 + + ALIGN_4 + +/**************************************************************************************************/ + +.L1_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + 
movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + jmp .L1_8_12 + ALIGN_4 + +.L1_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 7) + je .L1_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_17: + + KERNEL8x1_SUB + + jl .L1_8_17 + ALIGN_4 + + +.L1_8_19: + + SAVE8x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_8_11 + ALIGN_4 + + + +/**************************************************************************************************/ +.L1_4_10: + + testq $ 7, M + jz .L999 + + testq $ 4, M + jz .L1_4_20 + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 7) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_4_20: + + testq $ 2, M + jz .L1_4_40 + ALIGN_4 + +.L1_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + jmp .L1_4_22 + ALIGN_4 + +.L1_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 
7, %rax # if (k & 1) + je .L1_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_27: + + KERNEL2x1_SUB + + jl .L1_4_27 + ALIGN_4 + + +.L1_4_29: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_4_40: + testq $ 1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + jmp .L1_4_42 + ALIGN_4 + +.L1_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_47: + + KERNEL1x1_SUB + + jl .L1_4_47 + ALIGN_4 + + +.L1_4_49: + + SAVE1x1 + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L999: + vzeroupper + 
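// vzeroupper clears the upper YMM halves so (possibly SSE) callers do not + // pay the AVX/SSE transition penalty on Haswell. SP was saved in the + // prologue before the stack was grown and aligned to 4096 bytes, so + // restoring it below releases the bounce buffer and the alignment in one + // move. 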
+ movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + +#else + +/************************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L2_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L2_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK 
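 + // For these LEFT/TRANSA combinations the current tile consumes the + // trailing K - KK iterations; the branch below instead consumes the + // leading KK plus the tile width (8 values of AO, or 2 of BO), the usual + // OpenBLAS TRMM bookkeeping. KKK caches the per-tile trip count so the + // tail loop can reload it. 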
+.L2_10:
+	movq C, CO1
+	leaq (C, LDC, 2), C  // c += 2 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	movq OFFSET, %rax
+	movq %rax, KK
+#endif
+
+	movq A, AO  // aoffset = a
+	addq $ 16 * SIZE, AO
+
+	movq M, I
+	sarq $ 3, I  // i = (m >> 3)
+	je .L2_4_10
+
+	ALIGN_4
+/**********************************************************************************************************/
+
+.L2_8_11:
+
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 8 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 8 * SIZE, BO
+	movq %rax, BI  // Index for BO
+	leaq (,BI,4), BI  // BI = BI * 4 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 4, %rax  // rax = rax *16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $ 8, %rax  // number of values in AO
+#else
+	addq $ 2, %rax  // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $ -8, %rax  // K = K - ( K % 8 )
+	je .L2_8_16
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+
+	salq $ 4, %rax  // rax = rax *16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_8_12:
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+
+	je .L2_8_16
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x2_SUB
+
+	je .L2_8_16
+
+	jmp .L2_8_12
+	ALIGN_4
+
+.L2_8_16:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $ 7, %rax  # if (k & 7)
+	je .L2_8_19
+
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+
+	salq $ 4, %rax  // rax = rax *16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_8_17:
+
+	KERNEL8x2_SUB
+
+	jl .L2_8_17
+	ALIGN_4
+
+
+.L2_8_19:
+
+	SAVE8x2
+
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 4, %rax  // rax = rax *16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $ 8, KK
+#endif
+
+	addq $ 16 * SIZE, CO1  # coffset += 16
+	decq I  # i --
+	jg .L2_8_11
+	ALIGN_4
+
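+// Note (editor): the sections below peel the rows of M left over after the
+// 8-wide loop: M & 4 goes through the 4x2 kernel, M & 2 through the 2x2
+// kernel and M & 1 through the 1x2 kernel, so e.g. M = 15 is processed as
+// row blocks of 8 + 4 + 2 + 1.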
+
+/**********************************************************************************************************/
+
+
+
+
+.L2_4_10:
+	testq $ 7, M
+	jz .L2_4_60  // to next 2 lines of N
+
+	testq $ 4, M
+	jz .L2_4_20
+	ALIGN_4
+
+
+.L2_4_11:
+
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 8 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 8 * SIZE, BO
+	movq %rax, BI  // Index for BO
+	leaq (,BI,4), BI  // BI = BI * 4 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 3, %rax  // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $ 4, %rax  // number of values in AO
+#else
+	addq $ 2, %rax  // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $ -8, %rax  // K = K - ( K % 8 )
+	je .L2_4_16
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+
+	salq $ 3, %rax  // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_4_12:
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	je .L2_4_16
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	je .L2_4_16
+
+	jmp .L2_4_12
+	ALIGN_4
+
+.L2_4_16:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $ 7, %rax  # if (k & 7)
+	je .L2_4_19
+
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+
+	salq $ 3, %rax  // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_4_17:
+
+	KERNEL4x2_SUB
+
+	jl .L2_4_17
+	ALIGN_4
+
+
+.L2_4_19:
+
+	SAVE4x2
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 3, %rax  // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $ 4, KK
+#endif
+
+	addq $ 8 * SIZE, CO1  # coffset += 8
+	ALIGN_4
+
+
+
+/**************************************************************************
+* Rest of M
+***************************************************************************/
+
+.L2_4_20:
+
+	testq $ 2, M
+	jz .L2_4_40
+	ALIGN_4
+
+.L2_4_21:
+
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 8 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 8 * SIZE, BO
+	movq %rax, BI  // Index for BO
+	leaq (,BI,4), BI  // BI = BI * 4 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 2, %rax  // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $ 2, %rax  // number of values in AO
+#else
+	addq $ 2, %rax  // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $ -8, %rax  // K = K - ( K % 8 )
+	je .L2_4_26
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+
+	salq $ 2, %rax  // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_4_22:
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	je .L2_4_26
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	je .L2_4_26
+
+	jmp .L2_4_22
+	ALIGN_4
+
+.L2_4_26:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $ 7, %rax  # if (k & 7)
+	je .L2_4_29
+
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+
+	salq $ 2, %rax  // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_4_27:
+
+	KERNEL2x2_SUB
+
+	jl .L2_4_27
+	ALIGN_4
+
+
+.L2_4_29:
+
+	vbroadcastss ALPHA_R, %xmm0
+	vbroadcastss ALPHA_I, %xmm1
+
+	// swap the real and imaginary parts of each complex value
+	vshufps $ 0xb1, %xmm9, %xmm9, %xmm9
+	vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
+    defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+	vaddsubps %xmm9, %xmm8, %xmm8
+	vaddsubps %xmm11, %xmm10, %xmm10
+
+	vshufps $ 0xb1, %xmm8, %xmm8, %xmm9
+	vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
+
+#else
+	vaddsubps %xmm8, %xmm9, %xmm9
+	vaddsubps %xmm10, %xmm11, %xmm11
+
+	vmovaps %xmm9, %xmm8
+	vmovaps %xmm11, %xmm10
+
+	// swap the real and imaginary parts of each complex value
+	vshufps $ 0xb1, %xmm9, %xmm9, %xmm9
+	vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
+
+#endif
+
+	// multiply with ALPHA_R
+	vmulps %xmm8, %xmm0, %xmm8
+	vmulps %xmm10, %xmm0, %xmm10
+
+	// multiply with ALPHA_I
+	vmulps %xmm9, %xmm1, %xmm9
+	vmulps %xmm11, %xmm1, %xmm11
+
+	vaddsubps %xmm9, %xmm8, %xmm8
+	vaddsubps %xmm11, %xmm10, %xmm10
+
+
+
+#ifndef TRMMKERNEL
+
+	vaddps (CO1), %xmm8, %xmm8
+
+	vaddps (CO1, LDC), %xmm10, %xmm10
+
+#endif
+
+	vmovups %xmm8, (CO1)
+
+	vmovups %xmm10, (CO1, LDC)
+
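+	// Note (editor): the block above finishes the complex arithmetic
+	// for this 2x2 tile. vaddsubps folds the separately accumulated
+	// real-path and imaginary-path products into c = cr + i*ci, and
+	// the alpha scaling then computes, per element (scalar sketch,
+	// alpha = ar + i*ai):
+	//
+	//     t_r = cr * ar - ci * ai;
+	//     t_i = ci * ar + cr * ai;
+	//
+	// which is added to C (unless TRMMKERNEL) and stored back.
+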
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 2, %rax  // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $ 2, KK
+#endif
+
+	addq $ 4 * SIZE, CO1  # coffset += 4
+	decq I  # i --
+	jg .L2_4_21
+	ALIGN_4
+
+
+
+/**************************************************************************/
+.L2_4_40:
+	testq $ 1, M
+	jz .L2_4_60  // to next 2 lines of N
+
+	ALIGN_4
+
+.L2_4_41:
+
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 8 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 8 * SIZE, BO
+	movq %rax, BI  // Index for BO
+	leaq (,BI,4), BI  // BI = BI * 4 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 1, %rax  // rax = rax * 2 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $ 1, %rax  // number of values in AO
+#else
+	addq $ 2, %rax  // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $ -8, %rax  // K = K - ( K % 8 )
+	je .L2_4_46
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+
+	salq $ 1, %rax  // rax = rax * 2 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_4_42:
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	je .L2_4_46
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	je .L2_4_46
+
+	jmp .L2_4_42
+	ALIGN_4
+
+.L2_4_46:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $ 7, %rax  # if (k & 7)
+	je .L2_4_49
+
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+
+	salq $ 1, %rax  // rax = rax * 2 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_4_47:
+
+	KERNEL1x2_SUB
+
+	jl .L2_4_47
+	ALIGN_4
+
+
+.L2_4_49:
+
+	SAVE1x2
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,4), BI  // BI = BI * 4 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 1, %rax  // rax = rax * 2 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $ 1, KK
+#endif
+
+	addq $ 2 * SIZE, CO1  # coffset += 2
+	decq I  # i --
+	jg .L2_4_41
+	ALIGN_4
+
+
+
+
+.L2_4_60:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+	addq $ 2, KK
+#endif
+
+	decq J  // j --
+	jg .L2_01  // next 2 lines of N
+
+
+
+.L1_0:
+
+/************************************************************************************************
+* Loop for Nmod6 % 2 > 0
+*************************************************************************************************/
+
+	movq Nmod6, J
+	andq $ 1, J  // j % 2
+	je .L999
+	ALIGN_4
+
+.L1_01:
+	// copy to sub buffer
+	movq B, BO1
+	leaq BUFFER1, BO  // first buffer to BO
+	movq K, %rax
+	ALIGN_4
+
+.L1_02b:
+
+	vmovsd (BO1), %xmm0
+	vmovsd %xmm0, (BO)
+	addq $ 2*SIZE,BO1
+	addq $ 2*SIZE,BO
+	decq %rax
+	jnz .L1_02b
+
+.L1_02c:
+
+	movq BO1, B  // next offset of B
+
+.L1_10:
+	movq C, CO1
+	leaq (C, LDC, 1), C  // c += 1 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	movq OFFSET, %rax
+	movq %rax, KK
+#endif
+
+	movq A, AO  // aoffset = a
+	addq $ 16 * SIZE, AO
+
+	movq M, I
+	sarq $ 3, I  // i = (m >> 3)
+	je .L1_4_10
+
+	ALIGN_4
+
+/**************************************************************************************************/
+
+.L1_8_11:
+
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 4 * SIZE, BO
+	movq %rax, BI  // Index for BO
+	leaq (,BI,2), BI  // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 4, %rax  // rax = rax *16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $ 8, %rax  // number of values in AO
+#else
+	addq $ 1, %rax  // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
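+// Note (editor): the K loop below is unrolled by 8: andq $ -8 rounds the
+// trip count down to a multiple of 8 for .L1_8_12, and the K & 7 leftovers
+// run one step at a time in .L1_8_17. AO and BO are pre-advanced past the
+// block and %rax/BI are negated, so the KERNEL8x1_SUB macros index upward
+// from negative offsets and the loop exits when the counters reach zero.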
+	andq $ -8, %rax  // K = K - ( K % 8 )
+	je .L1_8_16
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+
+	salq $ 4, %rax  // rax = rax *16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L1_8_12:
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+
+	je .L1_8_16
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL8x1_SUB
+
+	je .L1_8_16
+
+	jmp .L1_8_12
+	ALIGN_4
+
+.L1_8_16:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $ 7, %rax  # if (k & 7)
+	je .L1_8_19
+
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+
+	salq $ 4, %rax  // rax = rax *16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L1_8_17:
+
+	KERNEL8x1_SUB
+
+	jl .L1_8_17
+	ALIGN_4
+
+
+.L1_8_19:
+
+	SAVE8x1
+
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 4, %rax  // rax = rax *16 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $ 8, KK
+#endif
+
+	addq $ 16 * SIZE, CO1  # coffset += 16
+	decq I  # i --
+	jg .L1_8_11
+	ALIGN_4
+
+
+
+/**************************************************************************************************/
+.L1_4_10:
+
+	testq $ 7, M
+	jz .L999
+
+	testq $ 4, M
+	jz .L1_4_20
+
+
+.L1_4_11:
+
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 4 * SIZE, BO
+	movq %rax, BI  // Index for BO
+	leaq (,BI,2), BI  // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 3, %rax  // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $ 4, %rax  // number of values in AO
+#else
+	addq $ 1, %rax  // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $ -8, %rax  // K = K - ( K % 8 )
+	je .L1_4_16
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+
+	salq $ 3, %rax  // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L1_4_12:
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	je .L1_4_16
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	je .L1_4_16
+
+	jmp .L1_4_12
+	ALIGN_4
+
+.L1_4_16:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $ 7, %rax  # if (k & 7)
+	je .L1_4_19
+
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+
+	salq $ 3, %rax  // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L1_4_17:
+
+	KERNEL4x1_SUB
+
+	jl .L1_4_17
+	ALIGN_4
+
+
+.L1_4_19:
+
+	SAVE4x1
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 3, %rax  // rax = rax * 8 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $ 4, KK
+#endif
+
+	addq $ 8 * SIZE, CO1  # coffset += 8
+	ALIGN_4
+
+
+
+/**************************************************************************
+* Rest of M
+***************************************************************************/
+
+.L1_4_20:
+
+	testq $ 2, M
+	jz .L1_4_40
+	ALIGN_4
+
+.L1_4_21:
+
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 4 * SIZE, BO
+	movq %rax, BI  // Index for BO
+	leaq (,BI,2), BI  // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 2, %rax  // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $ 2, %rax  // number of values in AO
+#else
+	addq $ 1, %rax  // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $ -8, %rax  // K = K - ( K % 8 )
+	je .L1_4_26
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+
+	salq $ 2, %rax  // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L1_4_22:
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	je .L1_4_26
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	je .L1_4_26
+
+	jmp .L1_4_22
+	ALIGN_4
+
+.L1_4_26:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $ 7, %rax  # if (k & 7)
+	je .L1_4_29
+
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+
+	salq $ 2, %rax  // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L1_4_27:
+
+	KERNEL2x1_SUB
+
+	jl .L1_4_27
+	ALIGN_4
+
+
+.L1_4_29:
+
+	SAVE2x1
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 2, %rax  // rax = rax * 4 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $ 2, KK
+#endif
+
+	addq $ 4 * SIZE, CO1  # coffset += 4
+	ALIGN_4
+
+
+
+/**************************************************************************/
+.L1_4_40:
+	testq $ 1, M
+	jz .L999  // done
+
+	ALIGN_4
+
+.L1_4_41:
+
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO  // first buffer to BO
+	addq $ 4 * SIZE, BO
+	movq %rax, BI  // Index for BO
+	leaq (,BI,2), BI  // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 1, %rax  // rax = rax * 2 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $ 1, %rax  // number of values in AO
+#else
+	addq $ 1, %rax  // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $ -8, %rax  // K = K - ( K % 8 )
+	je .L1_4_46
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+
+	salq $ 1, %rax  // rax = rax * 2 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L1_4_42:
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	je .L1_4_46
+
+	prefetcht0 A_PR1(AO,%rax,SIZE)
+	prefetcht0 B_PR1(BO,BI,SIZE)
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	je .L1_4_46
+
+	jmp .L1_4_42
+	ALIGN_4
+
+.L1_4_46:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $ 7, %rax  # if (k & 7)
+	je .L1_4_49
+
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+
+	salq $ 1, %rax  // rax = rax * 2 ; number of values
+	leaq (AO, %rax, SIZE), AO
+	leaq (BO, BI, SIZE), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L1_4_47:
+
+	KERNEL1x1_SUB
+
+	jl .L1_4_47
+	ALIGN_4
+
+
+.L1_4_49:
+
+	SAVE1x1
+
+
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI  // Index for BO
+	leaq ( ,BI,2), BI  // BI = BI * 2 ; number of values
+	leaq (BO, BI, SIZE), BO
+	salq $ 1, %rax  // rax = rax * 2 ; number of values
+	leaq (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $ 1, KK
+#endif
+
+	addq $ 2 * SIZE, CO1  # coffset += 2
+	ALIGN_4
+
+
+.L999:
+	vzeroupper
+
+	movq SP, %rsp
+	movq (%rsp), %rbx
+	movq 8(%rsp), %rbp
+	movq 16(%rsp), %r12
+	movq 24(%rsp), %r13
+	movq 32(%rsp), %r14
+	movq 40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+	movq 48(%rsp), %rdi
+	movq 56(%rsp), %rsi
+	vmovups 64(%rsp), %xmm6
+	vmovups 80(%rsp), %xmm7
+	vmovups 96(%rsp), %xmm8
+	vmovups 112(%rsp), %xmm9
+	vmovups 128(%rsp), %xmm10
+	vmovups 144(%rsp), %xmm11
+	vmovups 160(%rsp), %xmm12
+	vmovups 176(%rsp), %xmm13
+	vmovups 192(%rsp), %xmm14
+	vmovups 208(%rsp), %xmm15
+#endif
+
+	addq $ STACKSIZE, %rsp
+	ret
+
+	EPILOGUE
+
+
+#endif
+
-- 
2.7.4