From: Werner Saar Date: Tue, 1 Mar 2016 06:33:56 +0000 (+0100) Subject: added dgemm-, dtrmm-, zgemm- and ztrmm-kernel for power8 X-Git-Tag: v0.2.16^2~1^2~15^2~8 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b752858d6c37c0aa393c4a0636d3cda2ff2da179;p=platform%2Fupstream%2Fopenblas.git added dgemm-, dtrmm-, zgemm- and ztrmm-kernel for power8 --- diff --git a/common_power.h b/common_power.h index ab331b0..64e052f 100644 --- a/common_power.h +++ b/common_power.h @@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif +#if defined(POWER8) +#define L1_DUALFETCH +#define L1_PREFETCHSIZE (16 + 128 * 100) +#define L1_PREFETCH dcbtst +#endif + +# #ifndef L1_PREFETCH #define L1_PREFETCH dcbt #endif diff --git a/cpuid_power.c b/cpuid_power.c index 6790076..951204a 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -66,7 +66,7 @@ char *cpuname[] = { "POWER6", "CELL", "PPCG4", - "POWER8", + "POWER8" }; char *lowercpuname[] = { @@ -78,7 +78,7 @@ char *lowercpuname[] = { "power6", "cell", "ppcg4", - "power8", + "power8" }; char *corename[] = { @@ -90,7 +90,7 @@ char *corename[] = { "POWER6", "CELL", "PPCG4", - "POWER8", + "POWER8" }; int detect(void){ diff --git a/getarch.c b/getarch.c index ff607a4..f9c49e6 100644 --- a/getarch.c +++ b/getarch.c @@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER5" #endif -#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8) +#if defined(FORCE_POWER6) || defined(FORCE_POWER7) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER6" @@ -565,7 +565,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER6" #endif -#if defined(FORCE_POWER8) +#if defined(FORCE_POWER8) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER8" @@ -578,6 +578,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "POWER8" #endif + #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 63e675b..8e68274 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif +ifeq ($(CORE), POWER8) +USE_TRMM = 1 +endif + + SKERNELOBJS += \ diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL index cb9ed84..eae60cd 100644 --- a/kernel/power/KERNEL +++ b/kernel/power/KERNEL @@ -1,57 +1,3 @@ -SGEMM_BETA = gemm_beta.S -DGEMM_BETA = gemm_beta.S -CGEMM_BETA = zgemm_beta.S -ZGEMM_BETA = zgemm_beta.S - - -ifndef SSYMV_U_KERNEL -SSYMV_U_KERNEL = symv_U.S -endif - -ifndef SSYMV_L_KERNEL -SSYMV_L_KERNEL = symv_L.S -endif - -ifndef DSYMV_U_KERNEL -DSYMV_U_KERNEL = symv_U.S -endif - -ifndef DSYMV_L_KERNEL -DSYMV_L_KERNEL = symv_L.S -endif - -ifndef CSYMV_U_KERNEL -CSYMV_U_KERNEL = zsymv_U.S -endif - -ifndef CSYMV_L_KERNEL -CSYMV_L_KERNEL = zsymv_L.S -endif - -ifndef ZSYMV_U_KERNEL -ZSYMV_U_KERNEL = zsymv_U.S -endif - -ifndef ZSYMV_L_KERNEL -ZSYMV_L_KERNEL = zsymv_L.S -endif - -ifndef CHEMV_U_KERNEL -CHEMV_U_KERNEL = zsymv_U.S -endif - -ifndef CHEMV_L_KERNEL -CHEMV_L_KERNEL = zsymv_L.S -endif - -ifndef ZHEMV_U_KERNEL -ZHEMV_U_KERNEL = zsymv_U.S -endif - -ifndef ZHEMV_L_KERNEL -ZHEMV_L_KERNEL = zsymv_L.S -endif - ifndef STRSMKERNEL_LN STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c endif diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 344b205..3a627e4 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,56 +1,173 @@ -SGEMMKERNEL = gemm_kernel_power6.S -SGEMMINCOPY = -SGEMMITCOPY = -SGEMMONCOPY = gemm_ncopy_4.S -SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = gemm_kernel_power6.S -DGEMMINCOPY = -DGEMMITCOPY = +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = dtrmm_kernel_16x4_power8.S +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_16x4_power8.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = ../generic/gemm_tcopy_16.c DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = zgemm_kernel_power6.S -CGEMMINCOPY = ../generic/zgemm_ncopy_2.c -CGEMMITCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_4.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_power6.S -ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = 
zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRSMKERNEL_LN = trsm_kernel_power6_LN.S -STRSMKERNEL_LT = trsm_kernel_power6_LT.S -STRSMKERNEL_RN = trsm_kernel_power6_LT.S -STRSMKERNEL_RT = trsm_kernel_power6_RT.S - -DTRSMKERNEL_LN = trsm_kernel_power6_LN.S -DTRSMKERNEL_LT = trsm_kernel_power6_LT.S -DTRSMKERNEL_RN = trsm_kernel_power6_LT.S -DTRSMKERNEL_RT = trsm_kernel_power6_RT.S - -CTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S -CTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S -CTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S -CTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S - -ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S -ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S -ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S -ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
+CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/power/def_vsx.h b/kernel/power/def_vsx.h new file mode 100644 index 0000000..c2d29e2 --- /dev/null +++ b/kernel/power/def_vsx.h @@ -0,0 +1,64 @@ +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#define vs14 14 +#define vs15 15 +#define vs16 16 +#define vs17 17 +#define vs18 18 +#define vs19 19 +#define vs20 20 +#define vs21 21 +#define vs22 22 +#define vs23 23 +#define vs24 24 +#define vs25 25 +#define vs26 26 +#define vs27 27 +#define vs28 28 +#define vs29 29 +#define vs30 30 +#define vs31 31 +#define vs32 32 
+#define vs33 33 +#define vs34 34 +#define vs35 35 +#define vs36 36 +#define vs37 37 +#define vs38 38 +#define vs39 39 +#define vs40 40 +#define vs41 41 +#define vs42 42 +#define vs43 43 +#define vs44 44 +#define vs45 45 +#define vs46 46 +#define vs47 47 +#define vs48 48 +#define vs49 49 +#define vs50 50 +#define vs51 51 +#define vs52 52 +#define vs53 53 +#define vs54 54 +#define vs55 55 +#define vs56 56 +#define vs57 57 +#define vs58 58 +#define vs59 59 +#define vs60 60 +#define vs61 61 +#define vs62 62 +#define vs63 63 diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S new file mode 100644 index 0000000..53205ad --- /dev/null +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -0,0 +1,313 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs18 + +#define o0 0 + +#define o8 r15 +#define o24 r16 +#define ALPHA r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_16x4_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) +#endif + + stfd f1, ALPHA_SP + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + li PRE, 256 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + lxvdsx alpha_r, 0, ALPHA + +#include "dgemm_logic_16x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + 
lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S new file mode 100644 index 0000000..e19f78b --- /dev/null +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -0,0 +1,1647 @@ + srawi. J, N, 2 + ble DGEMM_L4_END + +DGEMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 4 + ble DGEMM_L4x16_END + +DGEMM_L4x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x16_SUB4 + +DGEMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble DGEMM_L4x16_LOOP_END + + .align 5 + +DGEMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt DGEMM_L4x16_LOOP + +DGEMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b DGEMM_L4x16_SUB1 + +DGEMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b DGEMM_L4x16_SUB1 + +DGEMM_L4x16_SUB0: + + andi. L, K, 7 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x16_SAVE + b DGEMM_L4x16_SUB2 + +DGEMM_L4x16_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x16_SAVE + +DGEMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x16_SUB2 + +DGEMM_L4x16_SAVE: + + SAVE4x16 + + addic. I, I, -1 + bgt DGEMM_L4x16_BEGIN + +DGEMM_L4x16_END: + +DGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble DGEMM_L4x1_END + + andi. T1, M, 8 + ble DGEMM_L4x8_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x8_SUB4 + +DGEMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble DGEMM_L4x8_LOOP_END + + .align 5 + +DGEMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. 
L, L, -1 + bgt DGEMM_L4x8_LOOP + +DGEMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b DGEMM_L4x8_SUB1 + +DGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b DGEMM_L4x8_SUB1 + +DGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x8_SAVE + b DGEMM_L4x8_SUB2 + +DGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x8_SAVE + +DGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x8_SUB2 + +DGEMM_L4x8_SAVE: + + SAVE4x8 + +DGEMM_L4x8_END: + +DGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble DGEMM_L4x4_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x4_SUB4 + +DGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble DGEMM_L4x4_LOOP_END + + .align 5 + +DGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt DGEMM_L4x4_LOOP + +DGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b DGEMM_L4x4_SUB1 + +DGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b DGEMM_L4x4_SUB1 + +DGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x4_SAVE + b DGEMM_L4x4_SUB2 + +DGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x4_SAVE + +DGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x4_SUB2 + +DGEMM_L4x4_SAVE: + + SAVE4x4 + +DGEMM_L4x4_END: + +DGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble DGEMM_L4x2_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x2_SUB4 + +DGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble DGEMM_L4x2_LOOP_END + + .align 5 + +DGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt DGEMM_L4x2_LOOP + +DGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b DGEMM_L4x2_SUB1 + +DGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b DGEMM_L4x2_SUB1 + +DGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x2_SAVE + b DGEMM_L4x2_SUB2 + +DGEMM_L4x2_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x2_SAVE + +DGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x2_SUB2 + +DGEMM_L4x2_SAVE: + + SAVE4x2 + +DGEMM_L4x2_END: + +DGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble DGEMM_L4x1_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x1_SUB4 + +DGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. 
L, L, -2 + ble DGEMM_L4x1_LOOP_END + + .align 5 + +DGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt DGEMM_L4x1_LOOP + +DGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b DGEMM_L4x1_SUB1 + +DGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b DGEMM_L4x1_SUB1 + +DGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x1_SAVE + b DGEMM_L4x1_SUB2 + +DGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x1_SAVE + +DGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x1_SUB2 + +DGEMM_L4x1_SAVE: + + SAVE4x1 + +DGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt DGEMM_L4_BEGIN + + andi. T2, N, 3 + ble L999 + +DGEMM_L4_END: + + b DGEMM_L2_BEGIN + +L999_H1: + + b L999 + +DGEMM_L2_BEGIN: + + andi. T1, N, 2 + ble DGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble DGEMM_L2x16_END + +DGEMM_L2x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x16_SUB4 + +DGEMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble DGEMM_L2x16_LOOP_END + + .align 5 + +DGEMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt DGEMM_L2x16_LOOP + +DGEMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b DGEMM_L2x16_SUB1 + +DGEMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b DGEMM_L2x16_SUB1 + +DGEMM_L2x16_SUB0: + + andi. L, K, 7 + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x16_SAVE + b DGEMM_L2x16_SUB2 + +DGEMM_L2x16_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x16_SAVE + +DGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x16_SUB2 + +DGEMM_L2x16_SAVE: + + SAVE2x16 + + addic. I, I, -1 + bgt DGEMM_L2x16_BEGIN + +DGEMM_L2x16_END: + +DGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble DGEMM_L2x1_END + + andi. T1, M, 8 + ble DGEMM_L2x8_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x8_SUB4 + +DGEMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble DGEMM_L2x8_LOOP_END + + .align 5 + +DGEMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. 
L, L, -1 + bgt DGEMM_L2x8_LOOP + +DGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b DGEMM_L2x8_SUB1 + +DGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b DGEMM_L2x8_SUB1 + +DGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x8_SAVE + b DGEMM_L2x8_SUB2 + +DGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x8_SAVE + +DGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x8_SUB2 + +DGEMM_L2x8_SAVE: + + SAVE2x8 + +DGEMM_L2x8_END: + +DGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble DGEMM_L2x4_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x4_SUB4 + +DGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble DGEMM_L2x4_LOOP_END + + .align 5 + +DGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt DGEMM_L2x4_LOOP + +DGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b DGEMM_L2x4_SUB1 + +DGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b DGEMM_L2x4_SUB1 + +DGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x4_SAVE + b DGEMM_L2x4_SUB2 + +DGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x4_SAVE + +DGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x4_SUB2 + +DGEMM_L2x4_SAVE: + + SAVE2x4 + +DGEMM_L2x4_END: + +DGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble DGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x2_SUB4 + +DGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble DGEMM_L2x2_LOOP_END + + .align 5 + +DGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt DGEMM_L2x2_LOOP + +DGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b DGEMM_L2x2_SUB1 + +DGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b DGEMM_L2x2_SUB1 + +DGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x2_SAVE + b DGEMM_L2x2_SUB2 + +DGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x2_SAVE + +DGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x2_SUB2 + +DGEMM_L2x2_SAVE: + + SAVE2x2 + +DGEMM_L2x2_END: + +DGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble DGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x1_SUB4 + +DGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. 
L, L, -2 + ble DGEMM_L2x1_LOOP_END + + .align 5 + +DGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt DGEMM_L2x1_LOOP + +DGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b DGEMM_L2x1_SUB1 + +DGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b DGEMM_L2x1_SUB1 + +DGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x1_SAVE + b DGEMM_L2x1_SUB2 + +DGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x1_SAVE + +DGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x1_SUB2 + +DGEMM_L2x1_SAVE: + + SAVE2x1 + +DGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +DGEMM_L2_END: +DGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble DGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble DGEMM_L1x16_END + +DGEMM_L1x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x16_SUB4 + +DGEMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble DGEMM_L1x16_LOOP_END + + .align 5 + +DGEMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt DGEMM_L1x16_LOOP + +DGEMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b DGEMM_L1x16_SUB1 + +DGEMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b DGEMM_L1x16_SUB1 + +DGEMM_L1x16_SUB0: + + andi. L, K, 7 + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x16_SAVE + b DGEMM_L1x16_SUB2 + +DGEMM_L1x16_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x16_SAVE + +DGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x16_SUB2 + +DGEMM_L1x16_SAVE: + + SAVE1x16 + + addic. I, I, -1 + bgt DGEMM_L1x16_BEGIN + +DGEMM_L1x16_END: + +DGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble DGEMM_L1x1_END + + andi. T1, M, 8 + ble DGEMM_L1x8_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x8_SUB4 + +DGEMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble DGEMM_L1x8_LOOP_END + + .align 5 + +DGEMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. 
L, L, -1 + bgt DGEMM_L1x8_LOOP + +DGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b DGEMM_L1x8_SUB1 + +DGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b DGEMM_L1x8_SUB1 + +DGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x8_SAVE + b DGEMM_L1x8_SUB2 + +DGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x8_SAVE + +DGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x8_SUB2 + +DGEMM_L1x8_SAVE: + + SAVE1x8 + +DGEMM_L1x8_END: + +DGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble DGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x4_SUB4 + +DGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble DGEMM_L1x4_LOOP_END + + .align 5 + +DGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt DGEMM_L1x4_LOOP + +DGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b DGEMM_L1x4_SUB1 + +DGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b DGEMM_L1x4_SUB1 + +DGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x4_SAVE + b DGEMM_L1x4_SUB2 + +DGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x4_SAVE + +DGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x4_SUB2 + +DGEMM_L1x4_SAVE: + + SAVE1x4 + +DGEMM_L1x4_END: + +DGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble DGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x2_SUB4 + +DGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble DGEMM_L1x2_LOOP_END + + .align 5 + +DGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt DGEMM_L1x2_LOOP + +DGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b DGEMM_L1x2_SUB1 + +DGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b DGEMM_L1x2_SUB1 + +DGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x2_SAVE + b DGEMM_L1x2_SUB2 + +DGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x2_SAVE + +DGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x2_SUB2 + +DGEMM_L1x2_SAVE: + + SAVE1x2 + +DGEMM_L1x2_END: + +DGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble DGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x1_SUB4 + +DGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. 
L, L, -2 + ble DGEMM_L1x1_LOOP_END + + .align 5 + +DGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt DGEMM_L1x1_LOOP + +DGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b DGEMM_L1x1_SUB1 + +DGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b DGEMM_L1x1_SUB1 + +DGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x1_SAVE + b DGEMM_L1x1_SUB2 + +DGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x1_SAVE + +DGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x1_SUB2 + +DGEMM_L1x1_SAVE: + + SAVE1x1 + +DGEMM_L1x1_END: + +DGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S new file mode 100644 index 0000000..d409098 --- /dev/null +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -0,0 +1,3400 @@ +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ + +.macro LOAD4x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, 
vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + +.endm + +.macro KERNEL4x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + 
xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.endm + +.macro KERNEL4x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endm + +.macro SAVE4x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + dcbt T1, PRE + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x 
vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + dcbt T1, PRE + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r + xvmaddadp vs4, vs52, alpha_r + xvmaddadp vs5, vs53, alpha_r + xvmaddadp vs6, vs54, alpha_r + xvmaddadp vs7, vs55, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r + xvmuldp vs4, vs52, alpha_r + xvmuldp vs5, vs53, alpha_r + xvmuldp vs6, vs54, alpha_r + xvmuldp vs7, vs55, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + dcbt T1, PRE + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r + xvmaddadp vs12, vs60, alpha_r + xvmaddadp vs13, vs61, alpha_r + xvmaddadp vs14, vs62, alpha_r + xvmaddadp vs15, vs63, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r + xvmuldp vs12, vs60, alpha_r + xvmuldp vs13, vs61, alpha_r + xvmuldp vs14, vs62, alpha_r + xvmuldp vs15, vs63, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + dcbt T1, PRE + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + 
lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + +.endm + +.macro KERNEL4x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.endm + +.macro KERNEL4x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC 
+ +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + 
lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, 
o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + 
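+	// scalar GEMM path (M=1 tail): xsmaddadp folds alpha*vs40 into the C element loaded just above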
xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, 
vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + 
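+	// T2 runs 64 bytes (8 doubles) ahead of T1, so these stores cover elements 8..15 of the 16-wide row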
stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 
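+	// row 2 of this N=2, M=8 tile is read back only in the GEMM build; the TRMM build overwrites C without reading it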
+#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + 
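+	// the FMAs still consume the previously loaded vs0/vs24-vs25 while vs8 and vs28-vs29 for the
+	// next step were fetched above: the same two-stage software pipelining used in the larger kernels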
xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, 
BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 
0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, 
vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S new file mode 100644 index 0000000..c892c65 --- /dev/null +++ 
b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -0,0 +1,327 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs18 + +#define o0 0 + +#define K1 r13 +#define KKK r14 +#define o8 r15 +#define o24 r16 +#define ALPHA r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_16x4_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfd f1, ALPHA_SP + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + mr KK, OFFSET +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, KK +#endif + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + li PRE, 256 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + lxvdsx alpha_r, 0, ALPHA + +#include "dtrmm_logic_16x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 
8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/dtrmm_logic_16x4_power8.S b/kernel/power/dtrmm_logic_16x4_power8.S new file mode 100644 index 0000000..f2886f8 --- /dev/null +++ b/kernel/power/dtrmm_logic_16x4_power8.S @@ -0,0 +1,2202 @@ + srawi. J, N, 2 + ble DTRMM_L4_END + +DTRMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble DTRMM_L4x16_END + +DTRMM_L4x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x16_SUB4 + +DTRMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble DTRMM_L4x16_LOOP_END + + .align 5 + +DTRMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt DTRMM_L4x16_LOOP + +DTRMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b DTRMM_L4x16_SUB1 + +DTRMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b DTRMM_L4x16_SUB1 + +DTRMM_L4x16_SUB0: + + andi. 
L, K1, 7 // K1 & 7 -> L + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x16_SAVE + b DTRMM_L4x16_SUB2 + +DTRMM_L4x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x16_SAVE + +DTRMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x16_SUB2 + +DTRMM_L4x16_SAVE: + + SAVE4x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt DTRMM_L4x16_BEGIN + +DTRMM_L4x16_END: + +DTRMM_L4x8_BEGIN: + andi. T2, M, 15 + ble DTRMM_L4x1_END + + andi. T1, M, 8 + ble DTRMM_L4x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x8_SUB4 + +DTRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble DTRMM_L4x8_LOOP_END + + .align 5 + +DTRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt DTRMM_L4x8_LOOP + +DTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b DTRMM_L4x8_SUB1 + +DTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b DTRMM_L4x8_SUB1 + +DTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x8_SAVE + b DTRMM_L4x8_SUB2 + +DTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x8_SAVE + +DTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x8_SUB2 + +DTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +DTRMM_L4x8_END: + +DTRMM_L4x4_BEGIN: + + andi. 
T1, M, 4 + ble DTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x4_SUB4 + +DTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble DTRMM_L4x4_LOOP_END + + .align 5 + +DTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt DTRMM_L4x4_LOOP + +DTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b DTRMM_L4x4_SUB1 + +DTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b DTRMM_L4x4_SUB1 + +DTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x4_SAVE + b DTRMM_L4x4_SUB2 + +DTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x4_SAVE + +DTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x4_SUB2 + +DTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +DTRMM_L4x4_END: + +DTRMM_L4x2_BEGIN: + + andi. T1, M, 2 + ble DTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x2_SUB4 + +DTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble DTRMM_L4x2_LOOP_END + + .align 5 + +DTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. 
L, L, -1 + bgt DTRMM_L4x2_LOOP + +DTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b DTRMM_L4x2_SUB1 + +DTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b DTRMM_L4x2_SUB1 + +DTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x2_SAVE + b DTRMM_L4x2_SUB2 + +DTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x2_SAVE + +DTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x2_SUB2 + +DTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +DTRMM_L4x2_END: + +DTRMM_L4x1_BEGIN: + + andi. T1, M, 1 + ble DTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x1_SUB4 + +DTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble DTRMM_L4x1_LOOP_END + + .align 5 + +DTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt DTRMM_L4x1_LOOP + +DTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b DTRMM_L4x1_SUB1 + +DTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b DTRMM_L4x1_SUB1 + +DTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x1_SAVE + b DTRMM_L4x1_SUB2 + +DTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x1_SAVE + +DTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L4x1_SUB2 + +DTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +DTRMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt DTRMM_L4_BEGIN + + andi. T2, N, 3 + ble L999 + +DTRMM_L4_END: + + b DTRMM_L2_BEGIN + +L999_H1: + + b L999 + +DTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble DTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble DTRMM_L2x16_END + +DTRMM_L2x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x16_SUB4 + +DTRMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble DTRMM_L2x16_LOOP_END + + .align 5 + +DTRMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt DTRMM_L2x16_LOOP + +DTRMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b DTRMM_L2x16_SUB1 + +DTRMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b DTRMM_L2x16_SUB1 + +DTRMM_L2x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x16_SAVE + b DTRMM_L2x16_SUB2 + +DTRMM_L2x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x16_SAVE + +DTRMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L2x16_SUB2 + +DTRMM_L2x16_SAVE: + + SAVE2x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt DTRMM_L2x16_BEGIN + +DTRMM_L2x16_END: + +DTRMM_L2x8_BEGIN: + andi. T2, M, 15 + ble DTRMM_L2x1_END + + andi. T1, M, 8 + ble DTRMM_L2x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x8_SUB4 + +DTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble DTRMM_L2x8_LOOP_END + + .align 5 + +DTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt DTRMM_L2x8_LOOP + +DTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b DTRMM_L2x8_SUB1 + +DTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b DTRMM_L2x8_SUB1 + +DTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x8_SAVE + b DTRMM_L2x8_SUB2 + +DTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x8_SAVE + +DTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt DTRMM_L2x8_SUB2 + +DTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +DTRMM_L2x8_END: + +DTRMM_L2x4_BEGIN: + + andi. 
T1, M, 4 + ble DTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x4_SUB4 + +DTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble DTRMM_L2x4_LOOP_END + + .align 5 + +DTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt DTRMM_L2x4_LOOP + +DTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b DTRMM_L2x4_SUB1 + +DTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b DTRMM_L2x4_SUB1 + +DTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x4_SAVE + b DTRMM_L2x4_SUB2 + +DTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x4_SAVE + +DTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt DTRMM_L2x4_SUB2 + +DTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +DTRMM_L2x4_END: + +DTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble DTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x2_SUB4 + +DTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble DTRMM_L2x2_LOOP_END + + .align 5 + +DTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. 
L, L, -1 + bgt DTRMM_L2x2_LOOP + +DTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b DTRMM_L2x2_SUB1 + +DTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b DTRMM_L2x2_SUB1 + +DTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x2_SAVE + b DTRMM_L2x2_SUB2 + +DTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x2_SAVE + +DTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt DTRMM_L2x2_SUB2 + +DTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +DTRMM_L2x2_END: + +DTRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble DTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x1_SUB4 + +DTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble DTRMM_L2x1_LOOP_END + + .align 5 + +DTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt DTRMM_L2x1_LOOP + +DTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b DTRMM_L2x1_SUB1 + +DTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b DTRMM_L2x1_SUB1 + +DTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x1_SAVE + b DTRMM_L2x1_SUB2 + +DTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x1_SAVE + +DTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L2x1_SUB2 + +DTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +DTRMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +DTRMM_L2_END: +DTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble DTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble DTRMM_L1x16_END + +DTRMM_L1x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x16_SUB4 + +DTRMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble DTRMM_L1x16_LOOP_END + + .align 5 + +DTRMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt DTRMM_L1x16_LOOP + +DTRMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b DTRMM_L1x16_SUB1 + +DTRMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b DTRMM_L1x16_SUB1 + +DTRMM_L1x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x16_SAVE + b DTRMM_L1x16_SUB2 + +DTRMM_L1x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x16_SAVE + +DTRMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L1x16_SUB2 + +DTRMM_L1x16_SAVE: + + SAVE1x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt DTRMM_L1x16_BEGIN + +DTRMM_L1x16_END: + +DTRMM_L1x8_BEGIN: + andi. T2, M, 15 + ble DTRMM_L1x1_END + + andi. T1, M, 8 + ble DTRMM_L1x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x8_SUB4 + +DTRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble DTRMM_L1x8_LOOP_END + + .align 5 + +DTRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt DTRMM_L1x8_LOOP + +DTRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b DTRMM_L1x8_SUB1 + +DTRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b DTRMM_L1x8_SUB1 + +DTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x8_SAVE + b DTRMM_L1x8_SUB2 + +DTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x8_SAVE + +DTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt DTRMM_L1x8_SUB2 + +DTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +DTRMM_L1x8_END: + +DTRMM_L1x4_BEGIN: + + andi. 
T1, M, 4 + ble DTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x4_SUB4 + +DTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble DTRMM_L1x4_LOOP_END + + .align 5 + +DTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt DTRMM_L1x4_LOOP + +DTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b DTRMM_L1x4_SUB1 + +DTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b DTRMM_L1x4_SUB1 + +DTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x4_SAVE + b DTRMM_L1x4_SUB2 + +DTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x4_SAVE + +DTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt DTRMM_L1x4_SUB2 + +DTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +DTRMM_L1x4_END: + +DTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble DTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x2_SUB4 + +DTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble DTRMM_L1x2_LOOP_END + + .align 5 + +DTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -1 + bgt DTRMM_L1x2_LOOP + +DTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b DTRMM_L1x2_SUB1 + +DTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b DTRMM_L1x2_SUB1 + +DTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x2_SAVE + b DTRMM_L1x2_SUB2 + +DTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x2_SAVE + +DTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt DTRMM_L1x2_SUB2 + +DTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +DTRMM_L1x2_END: + +DTRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble DTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x1_SUB4 + +DTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble DTRMM_L1x1_LOOP_END + + .align 5 + +DTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt DTRMM_L1x1_LOOP + +DTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b DTRMM_L1x1_SUB1 + +DTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b DTRMM_L1x1_SUB1 + +DTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x1_SAVE + b DTRMM_L1x1_SUB2 + +DTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x1_SAVE + +DTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L1x1_SUB2 + +DTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +DTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +DTRMM_L1_END: diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S index d7cfe5e..c6e69b4 100644 --- a/kernel/power/gemm_ncopy_4.S +++ b/kernel/power/gemm_ncopy_4.S @@ -104,12 +104,12 @@ #define PREFETCHWSIZE 72 #endif -#ifdef POWER8 +#ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif -#ifdef PPCG4 +#ifdef POWER8 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif @@ -198,7 +198,7 @@ LL(12): STFD c12, 14 * SIZE(B) STFD c16, 15 * SIZE(B) -#ifdef POWER6 +#if defined(POWER6) || defined(POWER8) dcbtst PREA, AO1 dcbtst PREA, AO2 dcbtst PREA, AO3 diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S index 46b1cd9..3051344 100644 --- a/kernel/power/gemm_tcopy_4.S +++ b/kernel/power/gemm_tcopy_4.S @@ -108,12 +108,12 @@ #define PREFETCHWSIZE 48 #endif -#ifdef POWER8 +#ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif -#ifdef PPCG4 +#ifdef POWER8 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif @@ -229,7 +229,7 @@ LL(12): STFD c15, 14 * SIZE(B1) STFD c16, 15 * SIZE(B1) -#ifdef POWER6 +#if defined(POWER6) || defined(POWER8) dcbtst PREA, AO1 dcbtst PREA, AO2 dcbtst PREA, AO3 diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 5c46c43..77587ec 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -174,11 +174,6 @@ #define PREFETCHSIZE_C 40 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 96 -#define PREFETCHSIZE_C 40 -#endif - #ifndef NEEDPARAM #ifndef __64BIT__ diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 4577530..817a60b 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -139,11 +139,6 @@ #define PREFETCHSIZE_C 8 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 96 -#define PREFETCHSIZE_C 8 -#endif - #define y01 f0 #define y02 f1 #define y03 f2 diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S index 9f759c3..f7d768c 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -168,11 +168,7 @@ #define PREFETCHSIZE_A 40 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 40 -#endif - -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index e4e419b..d8e0823 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -167,11 +167,7 @@ #define PREFETCHSIZE_A 40 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 40 -#endif - -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S new file mode 100644 index 0000000..03957f4 --- /dev/null +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -0,0 
+1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define L r15 +#define ALPHA r16 +#define o24 r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) +#endif + + stfd f1, ALPHA_R_SP + stfd f2, ALPHA_I_SP + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "zgemm_macros_8x2_power8.S" + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o8 , 8 + li o16 , 16 + li o24 , 24 + li o32 , 32 + li o48 , 48 + 
+#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + lxvdsx alpha_r, 0, ALPHA + lxvdsx alpha_i, o8, ALPHA + + .align 5 + +#include "zgemm_logic_8x2_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S new file mode 100644 index 0000000..e829fd6 --- /dev/null +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -0,0 +1,901 @@ + srawi. J, N, 1 + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble ZGEMM_L2x8_END + +ZGEMM_L2x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x8_SUB4 + +ZGEMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + dcbt AO, PRE + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble ZGEMM_L2x8_LOOP_END + + .align 5 + +ZGEMM_L2x8_LOOP: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt ZGEMM_L2x8_LOOP + +ZGEMM_L2x8_LOOP_END: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + KERNEL2x8_E2 + + b ZGEMM_L2x8_SUB1 + +ZGEMM_L2x8_SUB4: + + dcbt AO, PRE + KERNEL2x8_SUBI1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b ZGEMM_L2x8_SUB1 + +ZGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_END: + +ZGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + mr BO, B + srawi. 
L, K, 3 + ble ZGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x4_SUB4 + +ZGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble ZGEMM_L2x4_LOOP_END + + .align 5 + +ZGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt ZGEMM_L2x4_LOOP + +ZGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b ZGEMM_L2x4_SUB1 + +ZGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b ZGEMM_L2x4_SUB1 + +ZGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + +ZGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x4_SAVE + +ZGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L2x4_SUB2 + +ZGEMM_L2x4_SAVE: + + SAVE2x4 + +ZGEMM_L2x4_END: + +ZGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x2_SUB4 + +ZGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble ZGEMM_L2x2_LOOP_END + + .align 5 + +ZGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt ZGEMM_L2x2_LOOP + +ZGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b ZGEMM_L2x2_SUB1 + +ZGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b ZGEMM_L2x2_SUB1 + +ZGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + +ZGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x2_SAVE + +ZGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L2x2_SUB2 + +ZGEMM_L2x2_SAVE: + + SAVE2x2 + +ZGEMM_L2x2_END: + +ZGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x1_SUB4 + +ZGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble ZGEMM_L2x1_LOOP_END + + .align 5 + +ZGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt ZGEMM_L2x1_LOOP + +ZGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b ZGEMM_L2x1_SUB1 + +ZGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b ZGEMM_L2x1_SUB1 + +ZGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + +ZGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x1_SAVE + +ZGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. 
L, L, -1 + bgt ZGEMM_L2x1_SUB2 + +ZGEMM_L2x1_SAVE: + + SAVE2x1 + +ZGEMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt ZGEMM_L2_BEGIN + + andi. T2, N, 1 + ble L999 + +ZGEMM_L2_END: + + b ZGEMM_L1_BEGIN + +L999_H1: + + b L999 + +ZGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble ZGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 3 + ble ZGEMM_L1x8_END + +ZGEMM_L1x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x8_SUB4 + +ZGEMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + dcbt AO, PRE + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble ZGEMM_L1x8_LOOP_END + + .align 5 + +ZGEMM_L1x8_LOOP: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt ZGEMM_L1x8_LOOP + +ZGEMM_L1x8_LOOP_END: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + KERNEL1x8_E2 + + b ZGEMM_L1x8_SUB1 + +ZGEMM_L1x8_SUB4: + + dcbt AO, PRE + KERNEL1x8_SUBI1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b ZGEMM_L1x8_SUB1 + +ZGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + +ZGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x8_SAVE + +ZGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L1x8_SUB2 + +ZGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt ZGEMM_L1x8_BEGIN + +ZGEMM_L1x8_END: + +ZGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L1x1_END + + andi. T1, M, 4 + ble ZGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x4_SUB4 + +ZGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble ZGEMM_L1x4_LOOP_END + + .align 5 + +ZGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt ZGEMM_L1x4_LOOP + +ZGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b ZGEMM_L1x4_SUB1 + +ZGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b ZGEMM_L1x4_SUB1 + +ZGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + +ZGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x4_SAVE + +ZGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L1x4_SUB2 + +ZGEMM_L1x4_SAVE: + + SAVE1x4 + +ZGEMM_L1x4_END: + +ZGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x2_SUB4 + +ZGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -2 + ble ZGEMM_L1x2_LOOP_END + + .align 5 + +ZGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt ZGEMM_L1x2_LOOP + +ZGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b ZGEMM_L1x2_SUB1 + +ZGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b ZGEMM_L1x2_SUB1 + +ZGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + +ZGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x2_SAVE + +ZGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L1x2_SUB2 + +ZGEMM_L1x2_SAVE: + + SAVE1x2 + +ZGEMM_L1x2_END: + +ZGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x1_SUB4 + +ZGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble ZGEMM_L1x1_LOOP_END + + .align 5 + +ZGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt ZGEMM_L1x1_LOOP + +ZGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b ZGEMM_L1x1_SUB1 + +ZGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b ZGEMM_L1x1_SUB1 + +ZGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + +ZGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x1_SAVE + +ZGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt ZGEMM_L1x1_SUB2 + +ZGEMM_L1x1_SAVE: + + SAVE1x1 + +ZGEMM_L1x1_END: + +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S new file mode 100644 index 0000000..3e5ea9c --- /dev/null +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -0,0 +1,3074 @@ +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + 
xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_1 + + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + lxvd2x vs0, o0, AO // load real,imag from 
A + lxvd2x vs1, o16, AO // load real,imag from A + + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, o0, 
AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, 
imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // 
real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, 
vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs48 // realA*realB + XSFADD_R2 vs0, vs0, vs49 // imagA*imagB + + xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs48 // realA*imagB + XSFADD_I2 vs1, vs1, vs49 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs50 // realA*realB + XSFADD_R2 vs0, vs0, vs51 // imagA*imagB + + xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs50 // realA*imagB + XSFADD_I2 vs1, vs1, vs51 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs52 // realA*realB + XSFADD_R2 vs0, vs0, vs53 // imagA*imagB + + xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs52 // realA*imagB + XSFADD_I2 vs1, vs1, vs53 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp 
vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs54 // realA*realB + XSFADD_R2 vs0, vs0, vs55 // imagA*imagB + + xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs54 // realA*imagB + XSFADD_I2 vs1, vs1, vs55 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs56 // realA*realB + XSFADD_R2 vs0, vs0, vs57 // imagA*imagB + + xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs56 // realA*imagB + XSFADD_I2 vs1, vs1, vs57 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs58 // realA*realB + XSFADD_R2 vs0, vs0, vs59 // imagA*imagB + + xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs58 // realA*imagB + XSFADD_I2 vs1, vs1, vs59 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs60 // realA*realB + XSFADD_R2 vs0, vs0, vs61 // imagA*imagB + + xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs60 // realA*imagB + XSFADD_I2 vs1, vs1, vs61 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs62 // realA*realB + 
XSFADD_R2 vs0, vs0, vs63 // imagA*imagB + + xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs62 // realA*imagB + XSFADD_I2 vs1, vs1, vs63 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // 
real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp 
vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + 
xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + 
xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, o0, AO // load real,imag 
from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x2 + + + mr T1, CO + +#ifndef 
TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, 
vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL2x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x1 + 
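+// Note: a sketch of what this store/scale epilogue computes, inferred from the
+// inline comments; XSFADD_R1/R2 and XSFADD_I1/I2 are assumed to be add/sub helper
+// macros defined earlier in this file that select the sign for the conjugation
+// variants.  Each accumulator pair (vs32/vs33 for the first column of C,
+// vs34/vs35 for the second) holds
+//   vs32 = (realA*realB, imagA*realB)    vs33 = (realA*imagB, imagA*imagB)
+// which the code below folds into one complex value per C element,
+//   re = realA*realB -/+ imagA*imagB     (XSFADD_R1 / XSFADD_R2)
+//   im = realA*imagB +/- imagA*realB     (XSFADD_I1 / XSFADD_I2)
+// then scales by alpha = (alpha_r, alpha_i),
+//   C_re = re*alpha_r - im*alpha_i
+//   C_im = re*alpha_i + im*alpha_r
+// and, unless TRMMKERNEL is defined, adds the C values previously loaded from
+// memory before storing the merged real/imag pair back with stxvd2x.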
+ + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // 
real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, 
vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x8 + + + mr T1, CO + addi T2, 
T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, 
realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, 
LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // 
load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, 
vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 
// real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL1x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from 
B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index f934399..23e0177 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -170,11 +170,6 @@ #define PREFETCHSIZE_C 24 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 24 -#define PREFETCHSIZE_C 24 -#endif - #ifndef XCONJ #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index 2b45014..c0bad31 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -144,11 +144,6 @@ #define PREFETCHSIZE_C 8 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 24 -#define PREFETCHSIZE_C 8 -#endif - #if !(defined(CONJ) && defined(XCONJ)) #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index 394c030..b348e32 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -169,11 +169,7 @@ #define PREFETCHSIZE_A 112 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 112 -#endif - 
-#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index a061cd7..b631cbe 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -166,11 +166,7 @@ #define PREFETCHSIZE_A 112 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 112 -#endif - -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S new file mode 100644 index 0000000..dbbc8f9 --- /dev/null +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -0,0 +1,342 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define KKK r13 +#define K1 r14 +#define L r15 +#define ALPHA r16 +#define o24 r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfd f1, ALPHA_R_SP + stfd f2, ALPHA_I_SP + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "zgemm_macros_8x2_power8.S" + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + 
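+	// LDC arrives as a column stride in elements; shifting it by ZBASE_SHIFT
+	// (16 bytes per double-complex value) turns it into the byte stride that
+	// the SAVE macros use when walking C.
+	// PRE is the byte offset passed to dcbt when prefetching the A panel in
+	// the unrolled loops; o8..o48 are the fixed displacements used by the
+	// indexed vector loads/stores (lxvd2x/lxvdsx/stxvd2x).
+	// alpha_r/alpha_i (vs30/vs31) are reloaded here from the stack slots
+	// filled in the prologue, before entering ztrmm_logic_8x2_power8.S.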
slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o8 , 8 + li o16 , 16 + li o24 , 24 + li o32 , 32 + li o48 , 48 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + lxsdx alpha_r, 0, ALPHA + lxsdx alpha_i, o8, ALPHA + + .align 4 + +#include "ztrmm_logic_8x2_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrmm_logic_8x2_power8.S b/kernel/power/ztrmm_logic_8x2_power8.S new file mode 100644 index 0000000..e250dfa --- /dev/null +++ b/kernel/power/ztrmm_logic_8x2_power8.S @@ -0,0 +1,1201 @@ + srawi. J, N, 1 + ble ZTRMM_L2_END + +ZTRMM_L2_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble ZTRMM_L2x8_END + +ZTRMM_L2x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x8_SUB4 + +ZTRMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + dcbt AO, PRE + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble ZTRMM_L2x8_LOOP_END + + .align 5 + +ZTRMM_L2x8_LOOP: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. 
L, L, -1 + bgt ZTRMM_L2x8_LOOP + +ZTRMM_L2x8_LOOP_END: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + KERNEL2x8_E2 + + b ZTRMM_L2x8_SUB1 + +ZTRMM_L2x8_SUB4: + + dcbt AO, PRE + KERNEL2x8_SUBI1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b ZTRMM_L2x8_SUB1 + +ZTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x8_SAVE + b ZTRMM_L2x8_SUB2 + +ZTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x8_SAVE + +ZTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L2x8_SUB2 + +ZTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt ZTRMM_L2x8_BEGIN + +ZTRMM_L2x8_END: + +ZTRMM_L2x4_BEGIN: + andi. T2, M, 7 + ble ZTRMM_L2x1_END + + andi. T1, M, 4 + ble ZTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x4_SUB4 + +ZTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble ZTRMM_L2x4_LOOP_END + + .align 5 + +ZTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt ZTRMM_L2x4_LOOP + +ZTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b ZTRMM_L2x4_SUB1 + +ZTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b ZTRMM_L2x4_SUB1 + +ZTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x4_SAVE + b ZTRMM_L2x4_SUB2 + +ZTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x4_SAVE + +ZTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. 
L, L, -1 + bgt ZTRMM_L2x4_SUB2 + +ZTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +ZTRMM_L2x4_END: + +ZTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble ZTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x2_SUB4 + +ZTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble ZTRMM_L2x2_LOOP_END + + .align 5 + +ZTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt ZTRMM_L2x2_LOOP + +ZTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b ZTRMM_L2x2_SUB1 + +ZTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b ZTRMM_L2x2_SUB1 + +ZTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x2_SAVE + b ZTRMM_L2x2_SUB2 + +ZTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x2_SAVE + +ZTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L2x2_SUB2 + +ZTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +ZTRMM_L2x2_END: + +ZTRMM_L2x1_BEGIN: + + andi. 
T1, M, 1 + ble ZTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x1_SUB4 + +ZTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble ZTRMM_L2x1_LOOP_END + + .align 5 + +ZTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt ZTRMM_L2x1_LOOP + +ZTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b ZTRMM_L2x1_SUB1 + +ZTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b ZTRMM_L2x1_SUB1 + +ZTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x1_SAVE + b ZTRMM_L2x1_SUB2 + +ZTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x1_SAVE + +ZTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L2x1_SUB2 + +ZTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +ZTRMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt ZTRMM_L2_BEGIN + + andi. T2, N, 1 + ble L999 + +ZTRMM_L2_END: + + b ZTRMM_L1_BEGIN + +L999_H1: + + b L999 + +ZTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble ZTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble ZTRMM_L1x8_END + +ZTRMM_L1x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x8_SUB4 + +ZTRMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + dcbt AO, PRE + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble ZTRMM_L1x8_LOOP_END + + .align 5 + +ZTRMM_L1x8_LOOP: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt ZTRMM_L1x8_LOOP + +ZTRMM_L1x8_LOOP_END: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + KERNEL1x8_E2 + + b ZTRMM_L1x8_SUB1 + +ZTRMM_L1x8_SUB4: + + dcbt AO, PRE + KERNEL1x8_SUBI1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b ZTRMM_L1x8_SUB1 + +ZTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x8_SAVE + b ZTRMM_L1x8_SUB2 + +ZTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x8_SAVE + +ZTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L1x8_SUB2 + +ZTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt ZTRMM_L1x8_BEGIN + +ZTRMM_L1x8_END: + +ZTRMM_L1x4_BEGIN: + andi. T2, M, 7 + ble ZTRMM_L1x1_END + + andi. T1, M, 4 + ble ZTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x4_SUB4 + +ZTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble ZTRMM_L1x4_LOOP_END + + .align 5 + +ZTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt ZTRMM_L1x4_LOOP + +ZTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b ZTRMM_L1x4_SUB1 + +ZTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b ZTRMM_L1x4_SUB1 + +ZTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x4_SAVE + b ZTRMM_L1x4_SUB2 + +ZTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x4_SAVE + +ZTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L1x4_SUB2 + +ZTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +ZTRMM_L1x4_END: + +ZTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble ZTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x2_SUB4 + +ZTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble ZTRMM_L1x2_LOOP_END + + .align 5 + +ZTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt ZTRMM_L1x2_LOOP + +ZTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b ZTRMM_L1x2_SUB1 + +ZTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b ZTRMM_L1x2_SUB1 + +ZTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x2_SAVE + b ZTRMM_L1x2_SUB2 + +ZTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x2_SAVE + +ZTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. 
L, L, -1 + bgt ZTRMM_L1x2_SUB2 + +ZTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +ZTRMM_L1x2_END: + +ZTRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble ZTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x1_SUB4 + +ZTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble ZTRMM_L1x1_LOOP_END + + .align 5 + +ZTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt ZTRMM_L1x1_LOOP + +ZTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b ZTRMM_L1x1_SUB1 + +ZTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b ZTRMM_L1x1_SUB1 + +ZTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x1_SAVE + b ZTRMM_L1x1_SUB2 + +ZTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x1_SAVE + +ZTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L1x1_SUB2 + +ZTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +ZTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +ZTRMM_L1_END: diff --git a/param.h b/param.h index c46a1e9..e7dca2c 100644 --- a/param.h +++ b/param.h @@ -1962,35 +1962,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(POWER8) #define SNUMOPT 4 -#define DNUMOPT 4 +#define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 992 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 488 -#define ZGEMM_DEFAULT_P 248 +#define ZGEMM_DEFAULT_P 240 #define SGEMM_DEFAULT_Q 504 -#define DGEMM_DEFAULT_Q 504 +#define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 400 -#define ZGEMM_DEFAULT_Q 400 +#define ZGEMM_DEFAULT_Q 360 + +#define DGEMM_DEFAULT_R 14400 +#define ZGEMM_DEFAULT_R 7200 #define SYMV_P 8 #endif + #if defined(SPARC) && defined(V7) #define SNUMOPT 4
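As a reading aid for the SAVE macros added above (shared by the POWER8 zgemm and ztrmm kernels), the per-element store they perform reduces to an ordinary complex multiply of the accumulated dot product by alpha, followed, outside of TRMMKERNEL builds, by an add of the value already in C. The sketch below is illustrative only, assuming that reading of the macro comments; the function and variable names are hypothetical and do not appear in the patch.

/* Illustrative sketch only: the scalar equivalent of one element handled
 * by the SAVE* macros. acc_r/acc_i stand for the reduced real/imaginary
 * accumulators (vs0/vs1 after the XSFADD_R*/XSFADD_I* steps), alpha_r and
 * alpha_i for vs30/vs31, and trmm for the TRMMKERNEL conditional. */
static void save_element(double *c_re, double *c_im,
                         double acc_r, double acc_i,
                         double alpha_r, double alpha_i,
                         int trmm)
{
    double re = acc_r * alpha_r - acc_i * alpha_i;  /* xssubdp vs2 */
    double im = acc_r * alpha_i + acc_i * alpha_r;  /* xsadddp vs3 */

    if (!trmm) {        /* #ifndef TRMMKERNEL: add the preloaded C value */
        re += *c_re;    /* xvadddp of vs8..vs15 with vs16..vs23          */
        im += *c_im;
    }
    *c_re = re;         /* xxpermdi merge, then stxvd2x store            */
    *c_im = im;
}

In the assembly the same arithmetic is carried out per element with scalar VSX operations (xsmuldp/xssubdp/xsadddp), and the real and imaginary halves are merged with xxpermdi only immediately before the stxvd2x store.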