#CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c
-STRMMKERNEL = sgemm_kernel_power9.S
-DTRMMKERNEL = dgemm_kernel_power9.S
-CTRMMKERNEL = cgemm_kernel_power9.S
+STRMMKERNEL = sgemm_kernel_power10.c
+DTRMMKERNEL = dgemm_kernel_power10.c
+CTRMMKERNEL = cgemm_kernel_power10.S
ZTRMMKERNEL = zgemm_kernel_power9.S
-SGEMMKERNEL = sgemm_kernel_power9.S
+SGEMMKERNEL = sgemm_kernel_power10.c
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-DGEMMKERNEL = dgemm_kernel_power9.S
+DGEMMKERNEL = dgemm_kernel_power10.c
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = dgemm_tcopy_16_power8.S
DGEMMONCOPY = dgemm_ncopy_4_power8.S
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-CGEMMKERNEL = cgemm_kernel_power9.S
+CGEMMKERNEL = cgemm_kernel_power10.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013-2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+
+#define LOAD ld
+#define STACKSIZE (512 )
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
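+/* 512-byte local frame: f14-f31 saved at 0-143, r14-r31 at 144-287, vs52-vs63 at 288-479;
+   the link register is spilled above the frame at STACKSIZE+16 (FLINK_SAVE) */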
+#define M r3
+#define N r4
+#define K r5
+
+
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+
+
+#define alpha_r vs51
+#define alpha_i vs55
+#define save_permute_1 vs59
+#define permute_mask vs63
+#define o0 0
+
+
+#define T1 r11
+#define T2 r12
+#define T3 r14
+#define T4 r15
+#define T5 r16
+#define T6 r17
+#define L r18
+#define T7 r19
+#define T8 r20
+#define TEMP_REG r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define T9 r27
+#define T10 r28
+#define PRE r29
+
+#define T12 r30
+#define T13 r31
+
+#include "cgemm_macros_power10.S"
+
+.equ perm_const1, 0x0405060700010203
+.equ perm_const2, 0x0c0d0e0f08090a0b
+.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
+.equ save_permute_11, 0x0405060714151617
+
+
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+
+ addi SP, SP, -STACKSIZE
+ mflr r0
+
+
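+/* save the callee-saved FPRs f14-f31 */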
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+
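+/* save the callee-saved GPRs r14-r31 */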
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+
+
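+/* save the non-volatile VSX registers vs52-vs63, then the link register */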
+ stxv vs52, 288(SP)
+ stxv vs53, 304(SP)
+ stxv vs54, 320(SP)
+ stxv vs55, 336(SP)
+ stxv vs56, 352(SP)
+ stxv vs57, 368(SP)
+ stxv vs58, 384(SP)
+ stxv vs59, 400(SP)
+ stxv vs60, 416(SP)
+ stxv vs61, 432(SP)
+ stxv vs62, 448(SP)
+ stxv vs63, 464(SP)
+ std r0, FLINK_SAVE(SP)
+
+
+
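+/* ldc (and, for the TRMM variant, offset) are passed on the stack past the register arguments */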
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+
+
+
+#ifdef TRMMKERNEL
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
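+/* scale ldc from complex elements to a byte stride */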
+ slwi LDC, LDC, ZBASE_SHIFT
+
+
+
+	/* alpha arrives with the real part in f1 and the imaginary part in f2; convert each to single precision and splat */
+ xscvdpspn alpha_r,vs1
+ xscvdpspn alpha_i,vs2
+ xxspltw alpha_r,alpha_r,0
+ xxspltw alpha_i,alpha_i,0
+/* load the reverse permute mask for big endian:
+ uint128 = 0x0c0d0e0f08090a0b0405060700010203
+*/
+
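+/* materialize each 64-bit constant in a GPR with a lis/ori/rldicr/oris/ori sequence,
+   then pair them into VSRs with mtvsrdd below */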
+ lis T2, perm_const2@highest
+ lis T1, perm_const1@highest
+ lis T3, save_permute_12@highest
+ lis T4, save_permute_11@highest
+
+
+ ori T2, T2, perm_const2@higher
+ ori T1, T1, perm_const1@higher
+ ori T3, T3, save_permute_12@higher
+ ori T4, T4, save_permute_11@higher
+
+
+ rldicr T2, T2, 32, 31
+ rldicr T1, T1, 32, 31
+ rldicr T3, T3, 32, 31
+ rldicr T4, T4, 32, 31
+
+ oris T2, T2, perm_const2@h
+ oris T1, T1, perm_const1@h
+ oris T3, T3, save_permute_12@h
+ oris T4, T4, save_permute_11@h
+
+
+ ori T2, T2, perm_const2@l
+ ori T1, T1, perm_const1@l
+ ori T3, T3, save_permute_12@l
+ ori T4, T4, save_permute_11@l
+
+
+ li r0,0
+ li PRE,512
+
+#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
+/* negate alpha for these conjugation variants, since the kernel accumulates with addition: -1*(a+b) */
+ xvnegsp alpha_r,alpha_r
+ xvnegsp alpha_i,alpha_i
+#endif
+
+ mtvsrdd permute_mask,T2,T1
+ mtvsrdd save_permute_1,T3,T4
+
+	/* the mask is a reverse permute; swap its doublewords so it works as an inner permute */
+ xxpermdi permute_mask, permute_mask, permute_mask,2
+
+#include "cgemm_logic_power10.S"
+
+.L999:
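+/* epilogue: restore the callee-saved FPRs, GPRs and VSRs, reload the link register and return */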
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+
+ ld r0, FLINK_SAVE(SP)
+
+ lxv vs52, 288(SP)
+ lxv vs53, 304(SP)
+ lxv vs54, 320(SP)
+ lxv vs55, 336(SP)
+ lxv vs56, 352(SP)
+ lxv vs57, 368(SP)
+ lxv vs58, 384(SP)
+ lxv vs59, 400(SP)
+ mtlr r0
+ lxv vs60, 416(SP)
+ lxv vs61, 432(SP)
+ lxv vs62, 448(SP)
+ lxv vs63, 464(SP)
+
+ addi SP, SP, STACKSIZE
+ blr
+
+
+ EPILOGUE
+#endif
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013-2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define MY_ALIGN .align 3
+b CGEMM_L4
+/* MINI SUBROUTINES */
+/* 4x8 MAIN 128x+2 LOOP */
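+/* the KERNEL*_L2 macros take (A byte step, B byte step, unroll index, last flag);
+   for 4x8 each unrolled pair of k iterations consumes 128 bytes of A (8 complex * 8 bytes * 2)
+   and 64 bytes of B (4 complex * 8 bytes * 2) */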
+
+
+CGEMM_L4x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x8_2
+ MY_ALIGN
+CGEMM_L4x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+CGEMM_L4x8_K128:
+/*----------------------------------------*/
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_L2 128,64,15,0
+ KERNEL4x8_L2 128,64,16,0
+ KERNEL4x8_L2 128,64,17,0
+ KERNEL4x8_L2 128,64,18,0
+ KERNEL4x8_L2 128,64,19,0
+ KERNEL4x8_L2 128,64,20,0
+ KERNEL4x8_L2 128,64,21,0
+ KERNEL4x8_L2 128,64,22,0
+ KERNEL4x8_L2 128,64,23,0
+ KERNEL4x8_L2 128,64,24,0
+ KERNEL4x8_L2 128,64,25,0
+ KERNEL4x8_L2 128,64,26,0
+ KERNEL4x8_L2 128,64,27,0
+ KERNEL4x8_L2 128,64,28,0
+ KERNEL4x8_L2 128,64,29,0
+ KERNEL4x8_L2 128,64,30,0
+ KERNEL4x8_L2 128,64,31,0
+ KERNEL4x8_L2 128,64,32,0
+ KERNEL4x8_L2 128,64,33,0
+ KERNEL4x8_L2 128,64,34,0
+ KERNEL4x8_L2 128,64,35,0
+ KERNEL4x8_L2 128,64,36,0
+ KERNEL4x8_L2 128,64,37,0
+ KERNEL4x8_L2 128,64,38,0
+ KERNEL4x8_L2 128,64,39,0
+ KERNEL4x8_L2 128,64,40,0
+ KERNEL4x8_L2 128,64,41,0
+ KERNEL4x8_L2 128,64,42,0
+ KERNEL4x8_L2 128,64,43,0
+ KERNEL4x8_L2 128,64,44,0
+ KERNEL4x8_L2 128,64,45,0
+ KERNEL4x8_L2 128,64,46,0
+ KERNEL4x8_L2 128,64,47,0
+ KERNEL4x8_L2 128,64,48,0
+ KERNEL4x8_L2 128,64,49,0
+ KERNEL4x8_L2 128,64,50,0
+ KERNEL4x8_L2 128,64,51,0
+ KERNEL4x8_L2 128,64,52,0
+ KERNEL4x8_L2 128,64,53,0
+ KERNEL4x8_L2 128,64,54,0
+ KERNEL4x8_L2 128,64,55,0
+ KERNEL4x8_L2 128,64,56,0
+ KERNEL4x8_L2 128,64,57,0
+ KERNEL4x8_L2 128,64,58,0
+ KERNEL4x8_L2 128,64,59,0
+ KERNEL4x8_L2 128,64,60,0
+ KERNEL4x8_L2 128,64,61,0
+ KERNEL4x8_L2 128,64,62,0
+ KERNEL4x8_L2 128,64,63,1
+ bdnz CGEMM_L4x8_LOOP
+ MY_ALIGN
+CGEMM_L4x8_LOOP_END:
+/*----------------------------------------*/
+ END4x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_L2 128,64,15,0
+ KERNEL4x8_L2 128,64,16,0
+ KERNEL4x8_L2 128,64,17,0
+ KERNEL4x8_L2 128,64,18,0
+ KERNEL4x8_L2 128,64,19,0
+ KERNEL4x8_L2 128,64,20,0
+ KERNEL4x8_L2 128,64,21,0
+ KERNEL4x8_L2 128,64,22,0
+ KERNEL4x8_L2 128,64,23,0
+ KERNEL4x8_L2 128,64,24,0
+ KERNEL4x8_L2 128,64,25,0
+ KERNEL4x8_L2 128,64,26,0
+ KERNEL4x8_L2 128,64,27,0
+ KERNEL4x8_L2 128,64,28,0
+ KERNEL4x8_L2 128,64,29,0
+ KERNEL4x8_L2 128,64,30,0
+ KERNEL4x8_E2 128,64,31,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_E2 128,64,15,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_E2 128,64,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x4_2
+ MY_ALIGN
+CGEMM_L4x4_LOOP:
+/*----------------------------------------*/
+ KERNEL4x4_L2 64,64,0,0
+CGEMM_L4x4_K32:
+/*----------------------------------------*/
+ KERNEL4x4_L2 64,64,1,0
+ KERNEL4x4_L2 64,64,2,0
+ KERNEL4x4_L2 64,64,3,0
+ KERNEL4x4_L2 64,64,4,0
+ KERNEL4x4_L2 64,64,5,0
+ KERNEL4x4_L2 64,64,6,0
+ KERNEL4x4_L2 64,64,7,0
+ KERNEL4x4_L2 64,64,8,0
+ KERNEL4x4_L2 64,64,9,0
+ KERNEL4x4_L2 64,64,10,0
+ KERNEL4x4_L2 64,64,11,0
+ KERNEL4x4_L2 64,64,12,0
+ KERNEL4x4_L2 64,64,13,0
+ KERNEL4x4_L2 64,64,14,0
+ KERNEL4x4_L2 64,64,15,1
+ bdnz CGEMM_L4x4_LOOP
+ MY_ALIGN
+CGEMM_L4x4_LOOP_END:
+/*----------------------------------------*/
+ END4x4_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x4_2
+ KERNEL4x4_L2 64,64,0,0
+ KERNEL4x4_L2 64,64,1,0
+ KERNEL4x4_L2 64,64,2,0
+ KERNEL4x4_L2 64,64,3,0
+ KERNEL4x4_L2 64,64,4,0
+ KERNEL4x4_L2 64,64,5,0
+ KERNEL4x4_L2 64,64,6,0
+ KERNEL4x4_E2 64,64,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD4x4_2
+ KERNEL4x4_L2 64,64,0,0
+ KERNEL4x4_L2 64,64,1,0
+ KERNEL4x4_L2 64,64,2,0
+ KERNEL4x4_E2 64,64,3,1
+ blr
+
+
+CGEMM_4x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x2_2
+ MY_ALIGN
+CGEMM_L4x2_LOOP:
+/*----------------------------------------*/
+ KERNEL4x2_L2 32,64,0,0
+CGEMM_L4x2_K32:
+/*----------------------------------------*/
+ KERNEL4x2_L2 32,64,1,0
+ KERNEL4x2_L2 32,64,2,0
+ KERNEL4x2_L2 32,64,3,0
+ KERNEL4x2_L2 32,64,4,0
+ KERNEL4x2_L2 32,64,5,0
+ KERNEL4x2_L2 32,64,6,0
+ KERNEL4x2_L2 32,64,7,0
+ KERNEL4x2_L2 32,64,8,0
+ KERNEL4x2_L2 32,64,9,0
+ KERNEL4x2_L2 32,64,10,0
+ KERNEL4x2_L2 32,64,11,0
+ KERNEL4x2_L2 32,64,12,0
+ KERNEL4x2_L2 32,64,13,0
+ KERNEL4x2_L2 32,64,14,0
+ KERNEL4x2_L2 32,64,15,1
+ bdnz CGEMM_L4x2_LOOP
+ MY_ALIGN
+
+
+CGEMM_L4x2_LOOP_END:
+/*----------------------------------------*/
+ END4x2_2
+ blr
+ MY_ALIGN
+CGEMM_4x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x2_2
+ KERNEL4x2_L2 32,64,0,0
+ KERNEL4x2_L2 32,64,1,0
+ KERNEL4x2_L2 32,64,2,0
+ KERNEL4x2_L2 32,64,3,0
+ KERNEL4x2_L2 32,64,4,0
+ KERNEL4x2_L2 32,64,5,0
+ KERNEL4x2_L2 32,64,6,0
+ KERNEL4x2_E2 32,64,7,1
+ blr
+ MY_ALIGN
+CGEMM_4x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD4x2_2
+ KERNEL4x2_L2 32,64,0,0
+ KERNEL4x2_L2 32,64,1,0
+ KERNEL4x2_L2 32,64,2,0
+ KERNEL4x2_E2 32,64,3,1
+ blr
+
+
+CGEMM_4x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x1_2
+ MY_ALIGN
+CGEMM_L4x1_LOOP:
+/*----------------------------------------*/
+ KERNEL4x1_L2 16,64,0,0
+CGEMM_L4x1_K32:
+/*----------------------------------------*/
+ KERNEL4x1_L2 16,64,1,0
+ KERNEL4x1_L2 16,64,2,0
+ KERNEL4x1_L2 16,64,3,0
+ KERNEL4x1_L2 16,64,4,0
+ KERNEL4x1_L2 16,64,5,0
+ KERNEL4x1_L2 16,64,6,0
+ KERNEL4x1_L2 16,64,7,0
+ KERNEL4x1_L2 16,64,8,0
+ KERNEL4x1_L2 16,64,9,0
+ KERNEL4x1_L2 16,64,10,0
+ KERNEL4x1_L2 16,64,11,0
+ KERNEL4x1_L2 16,64,12,0
+ KERNEL4x1_L2 16,64,13,0
+ KERNEL4x1_L2 16,64,14,0
+ KERNEL4x1_L2 16,64,15,1
+ bdnz CGEMM_L4x1_LOOP
+ MY_ALIGN
+CGEMM_L4x1_LOOP_END:
+/*----------------------------------------*/
+ END4x1_2
+ blr
+
+ MY_ALIGN
+CGEMM_4x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD4x1_2
+ KERNEL4x1_L2 16,64,0,0
+ KERNEL4x1_L2 16,64,1,0
+ KERNEL4x1_L2 16,64,2,0
+ KERNEL4x1_L2 16,64,3,0
+ KERNEL4x1_L2 16,64,4,0
+ KERNEL4x1_L2 16,64,5,0
+ KERNEL4x1_L2 16,64,6,0
+ KERNEL4x1_E2 16,64,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD4x1_2
+ KERNEL4x1_L2 16,64,0,0
+ KERNEL4x1_L2 16,64,1,0
+ KERNEL4x1_L2 16,64,2,0
+ KERNEL4x1_E2 16,64,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
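+/* outer loop: process 4 columns at a time (CGEMM_L4), then 2 (CGEMM_L2), then 1 (CGEMM_L1);
+   within each, rows are blocked 8/4/2/1 */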
+ MY_ALIGN
+
+
+CGEMM_L4:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg TEMP_REG, OFFSET
+#endif
+	/* Preset vs57 to 0xffff0000ffff0000 for masking */
+ vspltisb v24, -1
+ vspltisb v25, 0
+ xxsldwi vs57, vs56, vs57, 1
+ xxpermdi vs57, vs57, vs57, 3
+ srawi. J, N, 2
+ ble CGEMM_L4_END
+
+
+CGEMM_L4_BEGIN:
+/*----------------------------------------*/
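+/* CO points at the current block of 4 output columns, T2 at the second column for prefetch;
+   C advances by 4*LDC to the next block */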
+ mr CO, C
+ slwi T1, LDC , 2
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble CGEMM_L4x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+CGEMM_L4x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,4
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T1-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
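+/* T8 = (K-2)/128 (T6 for TRMM) counts the fully unrolled passes; if there are none,
+   SUB0 handles the short-K cases, otherwise the remainder falls through to the SUB2 tails */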
+ ZERO4x8
+ ble CGEMM_L4x8_SUB0
+ bl CGEMM_L4x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble CGEMM_L4x8_SAVE
+ b CGEMM_L4x8_SUB2
+
+
+CGEMM_L4x8_SUB0:
+/*----------------------------------------*/
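+/* no full unrolled pass ran: L holds the remaining k; K of exactly 128 or 129 is special-cased
+   by rewinding AO/BO and running the unrolled K128 body once with CTR=1 */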
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP4x8_128K
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD4x8O 64,32
+ END4x8_WITHOUT_ADD
+ LOAD4x8_2O 128, 64
+ mtctr T8
+ bl CGEMM_L4x8_K128
+ b CGEMM_L4x8_SAVE
+ CMP4x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne CGEMM_L4x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-128
+ LOAD4x8_2O 128,64
+ bl CGEMM_L4x8_K128
+ b CGEMM_L4x8_SAVE
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble CGEMM_L4x8_SUB2_32
+ bl CGEMM_4x8_L64_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble CGEMM_L4x8_SUB2_16
+ bl CGEMM_4x8_L32_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x8_SUB2_8
+ bl CGEMM_4x8_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x8_SUB2_4
+ LOAD4x8_2
+ KERNEL4x8_L2 128,64, 0,0
+ KERNEL4x8_L2 128,64, 1,0
+ KERNEL4x8_L2 128,64, 2,0
+ KERNEL4x8_E2 128,64, 3,1
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x8_SUB2_2
+ LOAD4x8_2
+ KERNEL4x8_L2 128,64, 0,0
+ KERNEL4x8_E2 128,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x8_SUB2_1
+ LOAD4x8_2
+ KERNEL4x8_E2 128,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x8_SAVE
+ KERNEL4x8
+
+ MY_ALIGN
+CGEMM_L4x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ MY_ALIGN
+ SAVE4x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4
+#endif
+ bgt CGEMM_L4x8_BEGIN
+ andi. T2, M, 7
+ ble CGEMM_L4x1_END
+ andi. T1, M, 4
+ ble CGEMM_L4x4_END
+ b CGEMM_L4x4_BEGIN
+ MY_ALIGN
+
+
+CGEMM_L4x8_END:
+/*----------------------------------------*/
+
+
+CGEMM_L4x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble CGEMM_L4x1_END
+ andi. T1, M, 4
+ ble CGEMM_L4x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,4
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO4x4
+ ble CGEMM_L4x4_SUB0
+ bl CGEMM_4x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L4x4_SAVE
+ b CGEMM_L4x4_SUB2
+
+
+CGEMM_L4x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP4x4_32K
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD4x4O 32,32
+ END4x4_WITHOUT_ADD
+ LOAD4x4_2O 64, 64
+ mtctr T8
+ bl CGEMM_L4x4_K32
+ b CGEMM_L4x4_SAVE
+ CMP4x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L4x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-64
+ LOAD4x4_2O 64,64
+ bl CGEMM_L4x4_K32
+ b CGEMM_L4x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x4_SUB2_8
+ bl CGEMM_4x4_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x4_SUB2_4
+ bl CGEMM_4x4_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x4_SUB2_2
+ LOAD4x4_2
+ KERNEL4x4_L2 64,64, 0,0
+ KERNEL4x4_E2 64,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x4_SUB2_1
+ LOAD4x4_2
+ KERNEL4x4_E2 64,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x4_SAVE
+ KERNEL4x4
+
+
+CGEMM_L4x4_SAVE:
+/*----------------------------------------*/
+ SAVE4x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4
+#endif
+
+
+CGEMM_L4x4_END:
+/*----------------------------------------*/
+
+
+CGEMM_L4x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble CGEMM_L4x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,4
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO4x2
+ ble CGEMM_L4x2_SUB0
+ bl CGEMM_4x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L4x2_SAVE
+ b CGEMM_L4x2_SUB2
+
+
+CGEMM_L4x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP4x2_32K
+ addi BO,BO,-32
+ addi AO,AO,-16
+ LOAD4x2O 16,32
+ END4x2_WITHOUT_ADD
+ LOAD4x2_2O 32, 64
+ mtctr T8
+ bl CGEMM_L4x2_K32
+ b CGEMM_L4x2_SAVE
+ CMP4x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L4x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-32
+ LOAD4x2_2O 32,64
+ bl CGEMM_L4x2_K32
+ b CGEMM_L4x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x2_SUB2_8
+ bl CGEMM_4x2_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x2_SUB2_4
+ bl CGEMM_4x2_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x2_SUB2_2
+ LOAD4x2_2
+ KERNEL4x2_L2 32,64, 0,0
+ KERNEL4x2_E2 32,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x2_SUB2_1
+ LOAD4x2_2
+ KERNEL4x2_E2 32,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x2_SAVE
+ KERNEL4x2
+
+ MY_ALIGN
+CGEMM_L4x2_SAVE:
+/*----------------------------------------*/
+ SAVE4x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4
+#endif
+
+
+CGEMM_L4x2_END:
+/*----------------------------------------*/
+
+
+CGEMM_L4x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble CGEMM_L4x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,4
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO4x1
+ ble CGEMM_L4x1_SUB0
+ bl CGEMM_4x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L4x1_SAVE
+ b CGEMM_L4x1_SUB2
+
+
+CGEMM_L4x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP4x1_32K
+ addi BO,BO,-32
+ addi AO,AO,-8
+ LOAD4x1O 8,32
+ END4x1_WITHOUT_ADD
+ LOAD4x1_2O 16, 64
+ mtctr T8
+ bl CGEMM_L4x1_K32
+ b CGEMM_L4x1_SAVE
+ CMP4x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L4x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-64
+ addi AO,AO,-16
+ LOAD4x1_2O 16,64
+ bl CGEMM_L4x1_K32
+ b CGEMM_L4x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L4x1_SUB2_8
+ bl CGEMM_4x1_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L4x1_SUB2_4
+ bl CGEMM_4x1_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L4x1_SUB2_2
+ LOAD4x1_2
+ KERNEL4x1_L2 16,64, 0,0
+ KERNEL4x1_E2 16,64, 1,1
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L4x1_SUB2_1
+ LOAD4x1_2
+ KERNEL4x1_E2 16,64, 0,1
+ MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L4x1_SAVE
+ KERNEL4x1
+
+ MY_ALIGN
+CGEMM_L4x1_SAVE:
+/*----------------------------------------*/
+
+ SAVE4x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4
+#endif
+
+
+CGEMM_L4x1_END:
+/*----------------------------------------*/
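+	/* advance B past the 4 columns just consumed: K * 4 complex * 8 bytes */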
+ slwi T1, K, 5
+ addic. J, J, -1
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 4
+#endif
+ bgt CGEMM_L4_BEGIN
+
+
+CGEMM_L4_END:
+
+b CGEMM_L2
+/* MINI SUBROUTINES */
+/* 2x8 MAIN 128x+2 LOOP */
+
+
+CGEMM_L2x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x8_2
+ MY_ALIGN
+CGEMM_L2x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+CGEMM_L2x8_K128:
+/*----------------------------------------*/
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_L2 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 128,32,8,0
+ KERNEL2x8_L2 128,32,9,0
+ KERNEL2x8_L2 128,32,10,0
+ KERNEL2x8_L2 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 128,32,12,0
+ KERNEL2x8_L2 128,32,13,0
+ KERNEL2x8_L2 128,32,14,0
+ KERNEL2x8_L2 128,32,15,0
+ KERNEL2x8_L2 128,32,16,0
+ KERNEL2x8_L2 128,32,17,0
+ KERNEL2x8_L2 128,32,18,0
+ KERNEL2x8_L2 128,32,19,0
+ KERNEL2x8_L2 128,32,20,0
+ KERNEL2x8_L2 128,32,21,0
+ KERNEL2x8_L2 128,32,22,0
+ KERNEL2x8_L2 128,32,23,0
+ KERNEL2x8_L2 128,32,24,0
+ KERNEL2x8_L2 128,32,25,0
+ KERNEL2x8_L2 128,32,26,0
+ KERNEL2x8_L2 128,32,27,0
+ KERNEL2x8_L2 128,32,28,0
+ KERNEL2x8_L2 128,32,29,0
+ KERNEL2x8_L2 128,32,30,0
+ KERNEL2x8_L2 128,32,31,0
+ KERNEL2x8_L2 128,32,32,0
+ KERNEL2x8_L2 128,32,33,0
+ KERNEL2x8_L2 128,32,34,0
+ KERNEL2x8_L2 128,32,35,0
+ KERNEL2x8_L2 128,32,36,0
+ KERNEL2x8_L2 128,32,37,0
+ KERNEL2x8_L2 128,32,38,0
+ KERNEL2x8_L2 128,32,39,0
+ KERNEL2x8_L2 128,32,40,0
+ KERNEL2x8_L2 128,32,41,0
+ KERNEL2x8_L2 128,32,42,0
+ KERNEL2x8_L2 128,32,43,0
+ KERNEL2x8_L2 128,32,44,0
+ KERNEL2x8_L2 128,32,45,0
+ KERNEL2x8_L2 128,32,46,0
+ KERNEL2x8_L2 128,32,47,0
+ KERNEL2x8_L2 128,32,48,0
+ KERNEL2x8_L2 128,32,49,0
+ KERNEL2x8_L2 128,32,50,0
+ KERNEL2x8_L2 128,32,51,0
+ KERNEL2x8_L2 128,32,52,0
+ KERNEL2x8_L2 128,32,53,0
+ KERNEL2x8_L2 128,32,54,0
+ KERNEL2x8_L2 128,32,55,0
+ KERNEL2x8_L2 128,32,56,0
+ KERNEL2x8_L2 128,32,57,0
+ KERNEL2x8_L2 128,32,58,0
+ KERNEL2x8_L2 128,32,59,0
+ KERNEL2x8_L2 128,32,60,0
+ KERNEL2x8_L2 128,32,61,0
+ KERNEL2x8_L2 128,32,62,0
+ KERNEL2x8_L2 128,32,63,1
+ bdnz CGEMM_L2x8_LOOP
+ MY_ALIGN
+CGEMM_L2x8_LOOP_END:
+/*----------------------------------------*/
+ END2x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_L2 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 128,32,8,0
+ KERNEL2x8_L2 128,32,9,0
+ KERNEL2x8_L2 128,32,10,0
+ KERNEL2x8_L2 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 128,32,12,0
+ KERNEL2x8_L2 128,32,13,0
+ KERNEL2x8_L2 128,32,14,0
+ KERNEL2x8_L2 128,32,15,0
+ KERNEL2x8_L2 128,32,16,0
+ KERNEL2x8_L2 128,32,17,0
+ KERNEL2x8_L2 128,32,18,0
+ KERNEL2x8_L2 128,32,19,0
+ KERNEL2x8_L2 128,32,20,0
+ KERNEL2x8_L2 128,32,21,0
+ KERNEL2x8_L2 128,32,22,0
+ KERNEL2x8_L2 128,32,23,0
+ KERNEL2x8_L2 128,32,24,0
+ KERNEL2x8_L2 128,32,25,0
+ KERNEL2x8_L2 128,32,26,0
+ KERNEL2x8_L2 128,32,27,0
+ KERNEL2x8_L2 128,32,28,0
+ KERNEL2x8_L2 128,32,29,0
+ KERNEL2x8_L2 128,32,30,0
+ KERNEL2x8_E2 128,32,31,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_L2 128,32,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL2x8_L2 128,32,8,0
+ KERNEL2x8_L2 128,32,9,0
+ KERNEL2x8_L2 128,32,10,0
+ KERNEL2x8_L2 128,32,11,0
+ dcbt BO, T4
+ KERNEL2x8_L2 128,32,12,0
+ KERNEL2x8_L2 128,32,13,0
+ KERNEL2x8_L2 128,32,14,0
+ KERNEL2x8_E2 128,32,15,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL2x8_L2 128,32,0,0
+ KERNEL2x8_L2 128,32,1,0
+ dcbt AO, T2
+ KERNEL2x8_L2 128,32,2,0
+ KERNEL2x8_L2 128,32,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL2x8_L2 128,32,4,0
+ KERNEL2x8_L2 128,32,5,0
+ dcbt AO, T4
+ KERNEL2x8_L2 128,32,6,0
+ KERNEL2x8_E2 128,32,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x4_2
+ MY_ALIGN
+CGEMM_L2x4_LOOP:
+/*----------------------------------------*/
+ KERNEL2x4_L2 64,32,0,0
+CGEMM_L2x4_K32:
+/*----------------------------------------*/
+ KERNEL2x4_L2 64,32,1,0
+ KERNEL2x4_L2 64,32,2,0
+ KERNEL2x4_L2 64,32,3,0
+ KERNEL2x4_L2 64,32,4,0
+ KERNEL2x4_L2 64,32,5,0
+ KERNEL2x4_L2 64,32,6,0
+ KERNEL2x4_L2 64,32,7,0
+ KERNEL2x4_L2 64,32,8,0
+ KERNEL2x4_L2 64,32,9,0
+ KERNEL2x4_L2 64,32,10,0
+ KERNEL2x4_L2 64,32,11,0
+ KERNEL2x4_L2 64,32,12,0
+ KERNEL2x4_L2 64,32,13,0
+ KERNEL2x4_L2 64,32,14,0
+ KERNEL2x4_L2 64,32,15,1
+ bdnz CGEMM_L2x4_LOOP
+ MY_ALIGN
+CGEMM_L2x4_LOOP_END:
+/*----------------------------------------*/
+ END2x4_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 64,32,0,0
+ KERNEL2x4_L2 64,32,1,0
+ KERNEL2x4_L2 64,32,2,0
+ KERNEL2x4_L2 64,32,3,0
+ KERNEL2x4_L2 64,32,4,0
+ KERNEL2x4_L2 64,32,5,0
+ KERNEL2x4_L2 64,32,6,0
+ KERNEL2x4_E2 64,32,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x4_2
+ KERNEL2x4_L2 64,32,0,0
+ KERNEL2x4_L2 64,32,1,0
+ KERNEL2x4_L2 64,32,2,0
+ KERNEL2x4_E2 64,32,3,1
+ blr
+
+
+CGEMM_2x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x2_2
+ MY_ALIGN
+CGEMM_L2x2_LOOP:
+/*----------------------------------------*/
+ KERNEL2x2_L2 32,32,0,0
+CGEMM_L2x2_K32:
+/*----------------------------------------*/
+ KERNEL2x2_L2 32,32,1,0
+ KERNEL2x2_L2 32,32,2,0
+ KERNEL2x2_L2 32,32,3,0
+ KERNEL2x2_L2 32,32,4,0
+ KERNEL2x2_L2 32,32,5,0
+ KERNEL2x2_L2 32,32,6,0
+ KERNEL2x2_L2 32,32,7,0
+ KERNEL2x2_L2 32,32,8,0
+ KERNEL2x2_L2 32,32,9,0
+ KERNEL2x2_L2 32,32,10,0
+ KERNEL2x2_L2 32,32,11,0
+ KERNEL2x2_L2 32,32,12,0
+ KERNEL2x2_L2 32,32,13,0
+ KERNEL2x2_L2 32,32,14,0
+ KERNEL2x2_L2 32,32,15,1
+ bdnz CGEMM_L2x2_LOOP
+ MY_ALIGN
+
+
+CGEMM_L2x2_LOOP_END:
+/*----------------------------------------*/
+ END2x2_2
+ blr
+ MY_ALIGN
+CGEMM_2x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 32,32,0,0
+ KERNEL2x2_L2 32,32,1,0
+ KERNEL2x2_L2 32,32,2,0
+ KERNEL2x2_L2 32,32,3,0
+ KERNEL2x2_L2 32,32,4,0
+ KERNEL2x2_L2 32,32,5,0
+ KERNEL2x2_L2 32,32,6,0
+ KERNEL2x2_E2 32,32,7,1
+ blr
+ MY_ALIGN
+CGEMM_2x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x2_2
+ KERNEL2x2_L2 32,32,0,0
+ KERNEL2x2_L2 32,32,1,0
+ KERNEL2x2_L2 32,32,2,0
+ KERNEL2x2_E2 32,32,3,1
+ blr
+
+
+CGEMM_2x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD2x1_2
+ MY_ALIGN
+CGEMM_L2x1_LOOP:
+/*----------------------------------------*/
+ KERNEL2x1_L2 16,32,0,0
+CGEMM_L2x1_K32:
+/*----------------------------------------*/
+ KERNEL2x1_L2 16,32,1,0
+ KERNEL2x1_L2 16,32,2,0
+ KERNEL2x1_L2 16,32,3,0
+ KERNEL2x1_L2 16,32,4,0
+ KERNEL2x1_L2 16,32,5,0
+ KERNEL2x1_L2 16,32,6,0
+ KERNEL2x1_L2 16,32,7,0
+ KERNEL2x1_L2 16,32,8,0
+ KERNEL2x1_L2 16,32,9,0
+ KERNEL2x1_L2 16,32,10,0
+ KERNEL2x1_L2 16,32,11,0
+ KERNEL2x1_L2 16,32,12,0
+ KERNEL2x1_L2 16,32,13,0
+ KERNEL2x1_L2 16,32,14,0
+ KERNEL2x1_L2 16,32,15,1
+ bdnz CGEMM_L2x1_LOOP
+ MY_ALIGN
+CGEMM_L2x1_LOOP_END:
+/*----------------------------------------*/
+ END2x1_2
+ blr
+
+ MY_ALIGN
+CGEMM_2x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 16,32,0,0
+ KERNEL2x1_L2 16,32,1,0
+ KERNEL2x1_L2 16,32,2,0
+ KERNEL2x1_L2 16,32,3,0
+ KERNEL2x1_L2 16,32,4,0
+ KERNEL2x1_L2 16,32,5,0
+ KERNEL2x1_L2 16,32,6,0
+ KERNEL2x1_E2 16,32,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_2x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD2x1_2
+ KERNEL2x1_L2 16,32,0,0
+ KERNEL2x1_L2 16,32,1,0
+ KERNEL2x1_L2 16,32,2,0
+ KERNEL2x1_E2 16,32,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+CGEMM_L2:
+/*----------------------------------------*/
+
+ andi. J, N, 2
+ ble CGEMM_L2_END
+
+
+CGEMM_L2_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ slwi T1, LDC , 1
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble CGEMM_L2x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+CGEMM_L2x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T1-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
+ ZERO2x8
+ ble CGEMM_L2x8_SUB0
+ bl CGEMM_L2x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble CGEMM_L2x8_SAVE
+ b CGEMM_L2x8_SUB2
+
+
+CGEMM_L2x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP2x8_128K
+ addi BO,BO,-16
+ addi AO,AO,-64
+ LOAD2x8O 64,16
+ END2x8_WITHOUT_ADD
+ LOAD2x8_2O 128, 32
+ mtctr T8
+ bl CGEMM_L2x8_K128
+ b CGEMM_L2x8_SAVE
+ CMP2x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne CGEMM_L2x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-128
+ LOAD2x8_2O 128,32
+ bl CGEMM_L2x8_K128
+ b CGEMM_L2x8_SAVE
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble CGEMM_L2x8_SUB2_32
+ bl CGEMM_2x8_L64_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble CGEMM_L2x8_SUB2_16
+ bl CGEMM_2x8_L32_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x8_SUB2_8
+ bl CGEMM_2x8_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x8_SUB2_4
+ LOAD2x8_2
+ KERNEL2x8_L2 128,32, 0,0
+ KERNEL2x8_L2 128,32, 1,0
+ KERNEL2x8_L2 128,32, 2,0
+ KERNEL2x8_E2 128,32, 3,1
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x8_SUB2_2
+ LOAD2x8_2
+ KERNEL2x8_L2 128,32, 0,0
+ KERNEL2x8_E2 128,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x8_SUB2_1
+ LOAD2x8_2
+ KERNEL2x8_E2 128,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x8_SAVE
+ KERNEL2x8
+
+ MY_ALIGN
+CGEMM_L2x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ MY_ALIGN
+ SAVE2x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
+#endif
+ bgt CGEMM_L2x8_BEGIN
+ andi. T2, M, 7
+ ble CGEMM_L2x1_END
+ andi. T1, M, 4
+ ble CGEMM_L2x4_END
+ b CGEMM_L2x4_BEGIN
+ MY_ALIGN
+
+
+CGEMM_L2x8_END:
+/*----------------------------------------*/
+
+
+CGEMM_L2x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble CGEMM_L2x1_END
+ andi. T1, M, 4
+ ble CGEMM_L2x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x4
+ ble CGEMM_L2x4_SUB0
+ bl CGEMM_2x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L2x4_SAVE
+ b CGEMM_L2x4_SUB2
+
+
+CGEMM_L2x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x4_32K
+ addi BO,BO,-16
+ addi AO,AO,-32
+ LOAD2x4O 32,16
+ END2x4_WITHOUT_ADD
+ LOAD2x4_2O 64, 32
+ mtctr T8
+ bl CGEMM_L2x4_K32
+ b CGEMM_L2x4_SAVE
+ CMP2x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L2x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-64
+ LOAD2x4_2O 64,32
+ bl CGEMM_L2x4_K32
+ b CGEMM_L2x4_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x4_SUB2_8
+ bl CGEMM_2x4_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x4_SUB2_4
+ bl CGEMM_2x4_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x4_SUB2_2
+ LOAD2x4_2
+ KERNEL2x4_L2 64,32, 0,0
+ KERNEL2x4_E2 64,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x4_SUB2_1
+ LOAD2x4_2
+ KERNEL2x4_E2 64,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x4_SAVE
+ KERNEL2x4
+
+
+CGEMM_L2x4_SAVE:
+/*----------------------------------------*/
+ SAVE2x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
+#endif
+
+
+CGEMM_L2x4_END:
+/*----------------------------------------*/
+
+
+CGEMM_L2x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble CGEMM_L2x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x2
+ ble CGEMM_L2x2_SUB0
+ bl CGEMM_2x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L2x2_SAVE
+ b CGEMM_L2x2_SUB2
+
+
+CGEMM_L2x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x2_32K
+ addi BO,BO,-16
+ addi AO,AO,-16
+ LOAD2x2O 16,16
+ END2x2_WITHOUT_ADD
+ LOAD2x2_2O 32, 32
+ mtctr T8
+ bl CGEMM_L2x2_K32
+ b CGEMM_L2x2_SAVE
+ CMP2x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L2x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-32
+ LOAD2x2_2O 32,32
+ bl CGEMM_L2x2_K32
+ b CGEMM_L2x2_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x2_SUB2_8
+ bl CGEMM_2x2_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x2_SUB2_4
+ bl CGEMM_2x2_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x2_SUB2_2
+ LOAD2x2_2
+ KERNEL2x2_L2 32,32, 0,0
+ KERNEL2x2_E2 32,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x2_SUB2_1
+ LOAD2x2_2
+ KERNEL2x2_E2 32,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x2_SAVE
+ KERNEL2x2
+
+ MY_ALIGN
+CGEMM_L2x2_SAVE:
+/*----------------------------------------*/
+ SAVE2x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
+#endif
+
+
+CGEMM_L2x2_END:
+/*----------------------------------------*/
+
+
+CGEMM_L2x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble CGEMM_L2x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(T1-2) % 32x */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /**(K-2) % 32x */
+#endif
+ ZERO2x1
+ ble CGEMM_L2x1_SUB0
+ bl CGEMM_2x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L2x1_SAVE
+ b CGEMM_L2x1_SUB2
+
+
+CGEMM_L2x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP2x1_32K
+ addi BO,BO,-16
+ addi AO,AO,-8
+ LOAD2x1O 8,16
+ END2x1_WITHOUT_ADD
+ LOAD2x1_2O 16, 32
+ mtctr T8
+ bl CGEMM_L2x1_K32
+ b CGEMM_L2x1_SAVE
+ CMP2x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L2x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-32
+ addi AO,AO,-16
+ LOAD2x1_2O 16,32
+ bl CGEMM_L2x1_K32
+ b CGEMM_L2x1_SAVE
+ MY_ALIGN
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L2x1_SUB2_8
+ bl CGEMM_2x1_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L2x1_SUB2_4
+ bl CGEMM_2x1_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L2x1_SUB2_2
+ LOAD2x1_2
+ KERNEL2x1_L2 16,32, 0,0
+ KERNEL2x1_E2 16,32, 1,1
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L2x1_SUB2_1
+ LOAD2x1_2
+ KERNEL2x1_E2 16,32, 0,1
+ MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L2x1_SAVE
+ KERNEL2x1
+
+ MY_ALIGN
+CGEMM_L2x1_SAVE:
+/*----------------------------------------*/
+
+ SAVE2x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
+#endif
+
+
+CGEMM_L2x1_END:
+/*----------------------------------------*/
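+	/* advance B past the 2 columns just consumed: K * 2 complex * 8 bytes */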
+ slwi T1, K, 4
+
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 2
+#endif
+
+CGEMM_L2_END:
+
+
+b CGEMM_L1
+/* MINI SUBROUTINES */
+/* 1x8 MAIN 128x+2 LOOP */
+
+
+CGEMM_L1x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x8_2
+ MY_ALIGN
+CGEMM_L1x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+CGEMM_L1x8_K128:
+/*----------------------------------------*/
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_L2 128,16,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 128,16,8,0
+ KERNEL1x8_L2 128,16,9,0
+ KERNEL1x8_L2 128,16,10,0
+ KERNEL1x8_L2 128,16,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 128,16,12,0
+ KERNEL1x8_L2 128,16,13,0
+ KERNEL1x8_L2 128,16,14,0
+ KERNEL1x8_L2 128,16,15,0
+ KERNEL1x8_L2 128,16,16,0
+ KERNEL1x8_L2 128,16,17,0
+ KERNEL1x8_L2 128,16,18,0
+ KERNEL1x8_L2 128,16,19,0
+ KERNEL1x8_L2 128,16,20,0
+ KERNEL1x8_L2 128,16,21,0
+ KERNEL1x8_L2 128,16,22,0
+ KERNEL1x8_L2 128,16,23,0
+ KERNEL1x8_L2 128,16,24,0
+ KERNEL1x8_L2 128,16,25,0
+ KERNEL1x8_L2 128,16,26,0
+ KERNEL1x8_L2 128,16,27,0
+ KERNEL1x8_L2 128,16,28,0
+ KERNEL1x8_L2 128,16,29,0
+ KERNEL1x8_L2 128,16,30,0
+ KERNEL1x8_L2 128,16,31,0
+ KERNEL1x8_L2 128,16,32,0
+ KERNEL1x8_L2 128,16,33,0
+ KERNEL1x8_L2 128,16,34,0
+ KERNEL1x8_L2 128,16,35,0
+ KERNEL1x8_L2 128,16,36,0
+ KERNEL1x8_L2 128,16,37,0
+ KERNEL1x8_L2 128,16,38,0
+ KERNEL1x8_L2 128,16,39,0
+ KERNEL1x8_L2 128,16,40,0
+ KERNEL1x8_L2 128,16,41,0
+ KERNEL1x8_L2 128,16,42,0
+ KERNEL1x8_L2 128,16,43,0
+ KERNEL1x8_L2 128,16,44,0
+ KERNEL1x8_L2 128,16,45,0
+ KERNEL1x8_L2 128,16,46,0
+ KERNEL1x8_L2 128,16,47,0
+ KERNEL1x8_L2 128,16,48,0
+ KERNEL1x8_L2 128,16,49,0
+ KERNEL1x8_L2 128,16,50,0
+ KERNEL1x8_L2 128,16,51,0
+ KERNEL1x8_L2 128,16,52,0
+ KERNEL1x8_L2 128,16,53,0
+ KERNEL1x8_L2 128,16,54,0
+ KERNEL1x8_L2 128,16,55,0
+ KERNEL1x8_L2 128,16,56,0
+ KERNEL1x8_L2 128,16,57,0
+ KERNEL1x8_L2 128,16,58,0
+ KERNEL1x8_L2 128,16,59,0
+ KERNEL1x8_L2 128,16,60,0
+ KERNEL1x8_L2 128,16,61,0
+ KERNEL1x8_L2 128,16,62,0
+ KERNEL1x8_L2 128,16,63,1
+ bdnz CGEMM_L1x8_LOOP
+ MY_ALIGN
+CGEMM_L1x8_LOOP_END:
+/*----------------------------------------*/
+ END1x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_L2 128,16,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 128,16,8,0
+ KERNEL1x8_L2 128,16,9,0
+ KERNEL1x8_L2 128,16,10,0
+ KERNEL1x8_L2 128,16,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 128,16,12,0
+ KERNEL1x8_L2 128,16,13,0
+ KERNEL1x8_L2 128,16,14,0
+ KERNEL1x8_L2 128,16,15,0
+ KERNEL1x8_L2 128,16,16,0
+ KERNEL1x8_L2 128,16,17,0
+ KERNEL1x8_L2 128,16,18,0
+ KERNEL1x8_L2 128,16,19,0
+ KERNEL1x8_L2 128,16,20,0
+ KERNEL1x8_L2 128,16,21,0
+ KERNEL1x8_L2 128,16,22,0
+ KERNEL1x8_L2 128,16,23,0
+ KERNEL1x8_L2 128,16,24,0
+ KERNEL1x8_L2 128,16,25,0
+ KERNEL1x8_L2 128,16,26,0
+ KERNEL1x8_L2 128,16,27,0
+ KERNEL1x8_L2 128,16,28,0
+ KERNEL1x8_L2 128,16,29,0
+ KERNEL1x8_L2 128,16,30,0
+ KERNEL1x8_E2 128,16,31,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x8_L32_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_L2 128,16,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL1x8_L2 128,16,8,0
+ KERNEL1x8_L2 128,16,9,0
+ KERNEL1x8_L2 128,16,10,0
+ KERNEL1x8_L2 128,16,11,0
+ dcbt BO, T4
+ KERNEL1x8_L2 128,16,12,0
+ KERNEL1x8_L2 128,16,13,0
+ KERNEL1x8_L2 128,16,14,0
+ KERNEL1x8_E2 128,16,15,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x8_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL1x8_L2 128,16,0,0
+ KERNEL1x8_L2 128,16,1,0
+ dcbt AO, T2
+ KERNEL1x8_L2 128,16,2,0
+ KERNEL1x8_L2 128,16,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL1x8_L2 128,16,4,0
+ KERNEL1x8_L2 128,16,5,0
+ dcbt AO, T4
+ KERNEL1x8_L2 128,16,6,0
+ KERNEL1x8_E2 128,16,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x4_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x4_2
+ MY_ALIGN
+CGEMM_L1x4_LOOP:
+/*----------------------------------------*/
+ KERNEL1x4_L2 64,16,0,0
+CGEMM_L1x4_K32:
+/*----------------------------------------*/
+ KERNEL1x4_L2 64,16,1,0
+ KERNEL1x4_L2 64,16,2,0
+ KERNEL1x4_L2 64,16,3,0
+ KERNEL1x4_L2 64,16,4,0
+ KERNEL1x4_L2 64,16,5,0
+ KERNEL1x4_L2 64,16,6,0
+ KERNEL1x4_L2 64,16,7,0
+ KERNEL1x4_L2 64,16,8,0
+ KERNEL1x4_L2 64,16,9,0
+ KERNEL1x4_L2 64,16,10,0
+ KERNEL1x4_L2 64,16,11,0
+ KERNEL1x4_L2 64,16,12,0
+ KERNEL1x4_L2 64,16,13,0
+ KERNEL1x4_L2 64,16,14,0
+ KERNEL1x4_L2 64,16,15,1
+ bdnz CGEMM_L1x4_LOOP
+ MY_ALIGN
+CGEMM_L1x4_LOOP_END:
+/*----------------------------------------*/
+ END1x4_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x4_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 64,16,0,0
+ KERNEL1x4_L2 64,16,1,0
+ KERNEL1x4_L2 64,16,2,0
+ KERNEL1x4_L2 64,16,3,0
+ KERNEL1x4_L2 64,16,4,0
+ KERNEL1x4_L2 64,16,5,0
+ KERNEL1x4_L2 64,16,6,0
+ KERNEL1x4_E2 64,16,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x4_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x4_2
+ KERNEL1x4_L2 64,16,0,0
+ KERNEL1x4_L2 64,16,1,0
+ KERNEL1x4_L2 64,16,2,0
+ KERNEL1x4_E2 64,16,3,1
+ blr
+
+
+CGEMM_1x2_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x2_2
+ MY_ALIGN
+CGEMM_L1x2_LOOP:
+/*----------------------------------------*/
+ KERNEL1x2_L2 32,16,0,0
+CGEMM_L1x2_K32:
+/*----------------------------------------*/
+ KERNEL1x2_L2 32,16,1,0
+ KERNEL1x2_L2 32,16,2,0
+ KERNEL1x2_L2 32,16,3,0
+ KERNEL1x2_L2 32,16,4,0
+ KERNEL1x2_L2 32,16,5,0
+ KERNEL1x2_L2 32,16,6,0
+ KERNEL1x2_L2 32,16,7,0
+ KERNEL1x2_L2 32,16,8,0
+ KERNEL1x2_L2 32,16,9,0
+ KERNEL1x2_L2 32,16,10,0
+ KERNEL1x2_L2 32,16,11,0
+ KERNEL1x2_L2 32,16,12,0
+ KERNEL1x2_L2 32,16,13,0
+ KERNEL1x2_L2 32,16,14,0
+ KERNEL1x2_L2 32,16,15,1
+ bdnz CGEMM_L1x2_LOOP
+ MY_ALIGN
+
+
+CGEMM_L1x2_LOOP_END:
+/*----------------------------------------*/
+ END1x2_2
+ blr
+ MY_ALIGN
+CGEMM_1x2_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 32,16,0,0
+ KERNEL1x2_L2 32,16,1,0
+ KERNEL1x2_L2 32,16,2,0
+ KERNEL1x2_L2 32,16,3,0
+ KERNEL1x2_L2 32,16,4,0
+ KERNEL1x2_L2 32,16,5,0
+ KERNEL1x2_L2 32,16,6,0
+ KERNEL1x2_E2 32,16,7,1
+ blr
+ MY_ALIGN
+CGEMM_1x2_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x2_2
+ KERNEL1x2_L2 32,16,0,0
+ KERNEL1x2_L2 32,16,1,0
+ KERNEL1x2_L2 32,16,2,0
+ KERNEL1x2_E2 32,16,3,1
+ blr
+
+
+CGEMM_1x1_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD1x1_2
+ MY_ALIGN
+CGEMM_L1x1_LOOP:
+/*----------------------------------------*/
+ KERNEL1x1_L2 16,16,0,0
+CGEMM_L1x1_K32:
+/*----------------------------------------*/
+ KERNEL1x1_L2 16,16,1,0
+ KERNEL1x1_L2 16,16,2,0
+ KERNEL1x1_L2 16,16,3,0
+ KERNEL1x1_L2 16,16,4,0
+ KERNEL1x1_L2 16,16,5,0
+ KERNEL1x1_L2 16,16,6,0
+ KERNEL1x1_L2 16,16,7,0
+ KERNEL1x1_L2 16,16,8,0
+ KERNEL1x1_L2 16,16,9,0
+ KERNEL1x1_L2 16,16,10,0
+ KERNEL1x1_L2 16,16,11,0
+ KERNEL1x1_L2 16,16,12,0
+ KERNEL1x1_L2 16,16,13,0
+ KERNEL1x1_L2 16,16,14,0
+ KERNEL1x1_L2 16,16,15,1
+ bdnz CGEMM_L1x1_LOOP
+ MY_ALIGN
+CGEMM_L1x1_LOOP_END:
+/*----------------------------------------*/
+ END1x1_2
+ blr
+
+ MY_ALIGN
+CGEMM_1x1_L16_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 16,16,0,0
+ KERNEL1x1_L2 16,16,1,0
+ KERNEL1x1_L2 16,16,2,0
+ KERNEL1x1_L2 16,16,3,0
+ KERNEL1x1_L2 16,16,4,0
+ KERNEL1x1_L2 16,16,5,0
+ KERNEL1x1_L2 16,16,6,0
+ KERNEL1x1_E2 16,16,7,1
+ blr
+ MY_ALIGN
+
+
+CGEMM_1x1_L8_SUB:
+/*----------------------------------------*/
+ LOAD1x1_2
+ KERNEL1x1_L2 16,16,0,0
+ KERNEL1x1_L2 16,16,1,0
+ KERNEL1x1_L2 16,16,2,0
+ KERNEL1x1_E2 16,16,3,1
+ blr
+
+
+
+/* MAIN LOOP BEGINS */
+ MY_ALIGN
+
+
+CGEMM_L1:
+/*----------------------------------------*/
+
+ andi. J, N, 1
+ ble CGEMM_L1_END
+
+CGEMM_L1_BEGIN:
+/*----------------------------------------*/
+ mr CO, C
+ add T2,C,LDC
+ mr AO, A
+ add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+ srawi. I, M, 3
+ ble CGEMM_L1x8_END
+ dcbt CO,r0 /*just prefetch*/
+ dcbt T2,r0
+
+
+CGEMM_L1x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
+#else
+ mr BO, B
+ dcbt B, r0
+#endif
+ dcbt AO, r0
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
+ mr T1, T6
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(T1-2) % 128x */
+#else
+ mr T1, K
+/* TEMPS FOR PREFETCH */
+ li T2, 1024
+ li T3, 1024+512
+ addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+ li T4, 2048
+ li T5, 2048+512
+ srawi. T8, T1, 7 /**(K-2) % 128x */
+#endif
+ ZERO1x8
+ ble CGEMM_L1x8_SUB0
+ bl CGEMM_L1x8_LMAIN_SUB
+ andi. L, T1, 127
+ ble CGEMM_L1x8_SAVE
+ b CGEMM_L1x8_SUB2
+
+
+CGEMM_L1x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 255
+ cmpwi T6,129
+#else
+ andi. L, K, 255
+ cmpwi K,129
+#endif
+ li T8,1
+ bne CMP1x8_128K
+ addi BO,BO,-8
+ addi AO,AO,-64
+ LOAD1x8O 64,8
+ END1x8_WITHOUT_ADD
+ LOAD1x8_2O 128, 16
+ mtctr T8
+ bl CGEMM_L1x8_K128
+ b CGEMM_L1x8_SAVE
+ CMP1x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,128
+#else
+ cmpwi K,128
+#endif
+ bne CGEMM_L1x8_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-128
+ LOAD1x8_2O 128,16
+ bl CGEMM_L1x8_K128
+ b CGEMM_L1x8_SAVE
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 64
+ ble CGEMM_L1x8_SUB2_32
+ bl CGEMM_1x8_L64_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_32:
+/*----------------------------------------*/
+ andi. T1,L, 32
+ ble CGEMM_L1x8_SUB2_16
+ bl CGEMM_1x8_L32_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_16:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x8_SUB2_8
+ bl CGEMM_1x8_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x8_SUB2_4
+ LOAD1x8_2
+ KERNEL1x8_L2 128,16, 0,0
+ KERNEL1x8_L2 128,16, 1,0
+ KERNEL1x8_L2 128,16, 2,0
+ KERNEL1x8_E2 128,16, 3,1
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x8_SUB2_2
+ LOAD1x8_2
+ KERNEL1x8_L2 128,16, 0,0
+ KERNEL1x8_E2 128,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x8_SUB2_1
+ LOAD1x8_2
+ KERNEL1x8_E2 128,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x8_SAVE
+ KERNEL1x8
+
+ MY_ALIGN
+CGEMM_L1x8_SAVE:
+/*----------------------------------------*/
+ addic. I, I, -1
+ MY_ALIGN
+ SAVE1x8
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
+#endif
+ bgt CGEMM_L1x8_BEGIN
+ andi. T2, M, 7
+ ble CGEMM_L1x1_END
+ andi. T1, M, 4
+ ble CGEMM_L1x4_END
+ b CGEMM_L1x4_BEGIN
+ MY_ALIGN
+
+
+CGEMM_L1x8_END:
+/*----------------------------------------*/
+
+
+CGEMM_L1x4_BEGIN:
+/*----------------------------------------*/
+ andi. T2, M, 7
+ ble CGEMM_L1x1_END
+ andi. T1, M, 4
+ ble CGEMM_L1x4_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (T1-2) / 32: number of 32-wide unrolled passes */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (K-2) / 32: number of 32-wide unrolled passes */
+#endif
+ ZERO1x4
+ ble CGEMM_L1x4_SUB0
+ bl CGEMM_1x4_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L1x4_SAVE
+ b CGEMM_L1x4_SUB2
+
+
+CGEMM_L1x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x4_32K
+ addi BO,BO,-8
+ addi AO,AO,-32
+ LOAD1x4O 32,8
+ END1x4_WITHOUT_ADD
+ LOAD1x4_2O 64, 16
+ mtctr T8
+ bl CGEMM_L1x4_K32
+ b CGEMM_L1x4_SAVE
+ CMP1x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L1x4_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-64
+ LOAD1x4_2O 64,16
+ bl CGEMM_L1x4_K32
+ b CGEMM_L1x4_SAVE
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x4_SUB2_8
+ bl CGEMM_1x4_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x4_SUB2_4
+ bl CGEMM_1x4_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x4_SUB2_2
+ LOAD1x4_2
+ KERNEL1x4_L2 64,16, 0,0
+ KERNEL1x4_E2 64,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x4_SUB2_1
+ LOAD1x4_2
+ KERNEL1x4_E2 64,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x4_SAVE
+ KERNEL1x4
+
+
+CGEMM_L1x4_SAVE:
+/*----------------------------------------*/
+ SAVE1x4
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
+#endif
+
+
+CGEMM_L1x4_END:
+/*----------------------------------------*/
+
+
+CGEMM_L1x2_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 2
+ ble CGEMM_L1x2_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (T1-2) / 32: number of 32-wide unrolled passes */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (K-2) / 32: number of 32-wide unrolled passes */
+#endif
+ ZERO1x2
+ ble CGEMM_L1x2_SUB0
+ bl CGEMM_1x2_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L1x2_SAVE
+ b CGEMM_L1x2_SUB2
+
+
+CGEMM_L1x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x2_32K
+ addi BO,BO,-8
+ addi AO,AO,-16
+ LOAD1x2O 16,8
+ END1x2_WITHOUT_ADD
+ LOAD1x2_2O 32, 16
+ mtctr T8
+ bl CGEMM_L1x2_K32
+ b CGEMM_L1x2_SAVE
+ CMP1x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L1x2_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-32
+ LOAD1x2_2O 32,16
+ bl CGEMM_L1x2_K32
+ b CGEMM_L1x2_SAVE
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x2_SUB2_8
+ bl CGEMM_1x2_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x2_SUB2_4
+ bl CGEMM_1x2_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x2_SUB2_2
+ LOAD1x2_2
+ KERNEL1x2_L2 32,16, 0,0
+ KERNEL1x2_E2 32,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x2_SUB2_1
+ LOAD1x2_2
+ KERNEL1x2_E2 32,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x2_SAVE
+ KERNEL1x2
+
+ MY_ALIGN
+CGEMM_L1x2_SAVE:
+/*----------------------------------------*/
+ SAVE1x2
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1
+#endif
+
+
+CGEMM_L1x2_END:
+/*----------------------------------------*/
+
+
+CGEMM_L1x1_BEGIN:
+/*----------------------------------------*/
+ andi. T1, M, 1
+ ble CGEMM_L1x1_END
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
+#else
+ mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,1
+ mr T1, T6
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (T1-2) / 32: number of 32-wide unrolled passes */
+#else
+ mr T1, K
+ addi T1,T1, -2
+ srawi. T8, T1, 5 /* (K-2) / 32: number of 32-wide unrolled passes */
+#endif
+ ZERO1x1
+ ble CGEMM_L1x1_SUB0
+ bl CGEMM_1x1_LMAIN_SUB
+ andi. L, T1, 31
+ ble CGEMM_L1x1_SAVE
+ b CGEMM_L1x1_SUB2
+
+
+CGEMM_L1x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ andi. L, T6, 63
+ cmpwi T6,33
+#else
+ andi. L, K, 63
+ cmpwi K,33
+#endif
+ li T8,1
+ bne CMP1x1_32K
+ addi BO,BO,-8
+ addi AO,AO,-8
+ LOAD1x1O 8,8
+ END1x1_WITHOUT_ADD
+ LOAD1x1_2O 16, 16
+ mtctr T8
+ bl CGEMM_L1x1_K32
+ b CGEMM_L1x1_SAVE
+ CMP1x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+ cmpwi T6,32
+#else
+ cmpwi K,32
+#endif
+ bne CGEMM_L1x1_SUB2
+ MY_ALIGN
+ mtctr T8
+ addi BO,BO,-16
+ addi AO,AO,-16
+ LOAD1x1_2O 16,16
+ bl CGEMM_L1x1_K32
+ b CGEMM_L1x1_SAVE
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2:
+/*----------------------------------------*/
+ andi. T1,L, 16
+ ble CGEMM_L1x1_SUB2_8
+ bl CGEMM_1x1_L16_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_8:
+/*----------------------------------------*/
+ andi. T1,L, 8
+ ble CGEMM_L1x1_SUB2_4
+ bl CGEMM_1x1_L8_SUB
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_4:
+/*----------------------------------------*/
+ andi. T1,L, 4
+ ble CGEMM_L1x1_SUB2_2
+ LOAD1x1_2
+ KERNEL1x1_L2 16,16, 0,0
+ KERNEL1x1_E2 16,16, 1,1
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_2:
+/*----------------------------------------*/
+ andi. T1,L, 2
+ ble CGEMM_L1x1_SUB2_1
+ LOAD1x1_2
+ KERNEL1x1_E2 16,16, 0,1
+ MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_1:
+/*----------------------------------------*/
+ andi. T1,L, 1
+ ble CGEMM_L1x1_SAVE
+ KERNEL1x1
+
+ MY_ALIGN
+CGEMM_L1x1_SAVE:
+/*----------------------------------------*/
+
+ SAVE1x1
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
+#endif
+
+
+CGEMM_L1x1_END:
+/*----------------------------------------*/
+ slwi T1, K, 3
+
+ add B, B, T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi TEMP_REG, TEMP_REG, 1
+#endif
+
+CGEMM_L1_END:
+
+
+
+
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013-2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define unit_size 8
+#define DISP32(ind, disp) (ind*unit_size*32+disp)
+#define DISP16(ind, disp) (ind*unit_size*16+disp)
+#define DISP8(ind, disp) (ind*unit_size*8+disp)
+#define DISP4(ind, disp) (ind*unit_size*4+disp)
+#define DISP2(ind, disp) (ind*unit_size*2+disp)
+#define DISP1(ind, disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
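+/* unit_size is the byte size of one single-precision complex element (2*4 bytes).
+   DISPn(ind, disp) gives the byte displacement of the ind-th unrolled step when n
+   elements are consumed per step, i.e. ind*n*8 + disp; DISPX passes a raw
+   displacement through unchanged. */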
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
+ xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
+ xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
+ xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2
+#else // CC || CR || RC || RR
+ /* assume {-alpha_r,-alpha_i} for this case */
+ /* i1*i2 - r1*r2, so negate the real part of alpha instead to fix the sign */
+ xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1
+ /* negate the imaginary part of alpha instead to fix the sign */
+ xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#endif
+.endm
+
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
+ xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
+ xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
+ xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#else // CC || CR || RC || RR
+ /* assume {-alpha_r,-alpha_i} for this case */
+ /* i1*i2 - r1*r2, so negate the real part of alpha instead to fix the sign */
+ xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1
+ /* negate the imaginary part of alpha instead to fix the sign */
+ xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#endif
+.endm
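+/* Both AGGREGATE macros combine the partial real/imaginary sums produced by the
+   rank-1 GER updates; the #if chain picks the add/subtract pattern (and operand
+   order) required by the conjugation variant (NN/CN/NC/CC families), and the
+   _A_PERMUTE form appears to differ only in which cases swap the subtraction
+   order. */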
+
+/* PART1: VSOUT1 = {i0,i1} * {alpha_i,alpha_i}; VSOUT2 = {r0,r1} * {alpha_i,alpha_i} (folded into PART2 below) */
+
+.macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2
+ xvmulsp \VSOUT1, \VSINII, alpha_i
+ xvmulsp \VSOUT2, \VSINRR, alpha_i
+.endm
+
+/* PART2: VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1; VSOUT2 = VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+.macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2
+ xvmsubasp \VSOUT1, \VSINRR, alpha_r
+ xvmaddasp \VSOUT2, \VSINII, alpha_r
+.endm
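+/* PART1 and PART2 together perform the complex scaling by alpha.  With VSINRR
+   holding real parts and VSINII imaginary parts, the net result is
+     VSOUT1 = VSINRR*alpha_r - VSINII*alpha_i   (real part)
+     VSOUT2 = VSINII*alpha_r + VSINRR*alpha_i   (imaginary part)
+   because the xvmsubasp/xvmaddasp in PART2 fold in the PART1 products. */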
+
+.macro PERMUTE1 OUT, R1, R2, R3, R4
+ xxsel vs62, \R1, \R2, vs57
+ xxsel \OUT, \R3, \R4, vs57
+ xxpermdi \OUT, \OUT, vs62, 1
+.endm
+.macro PERMUTE2 OUT, R1, R2, R3, R4
+ xxsel vs62, \R2, \R1, vs57
+ xxsel \OUT, \R4, \R3, vs57
+ xxpermdi \OUT, vs62, \OUT, 1
+ xxperm \OUT, \OUT, permute_mask
+.endm
+.macro PERMUTE3 OUT, R1, R2, R3, R4
+ xxsel vs62, \R1, \R2, vs57
+ xxsel \OUT, \R3, \R4, vs57
+ xxpermdi \OUT, vs62, \OUT, 2
+.endm
+.macro PERMUTE4 OUT, R1, R2, R3, R4
+ xxsel vs62, \R2, \R1, vs57
+ xxsel \OUT, \R4, \R3, vs57
+ xxpermdi \OUT, \OUT, vs62, 2
+ xxperm \OUT, \OUT, permute_mask
+.endm
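+/* PERMUTE1..PERMUTE4 appear to gather lanes from two accumulator row pairs with
+   xxsel (mask in vs57), merge doubleword halves with xxpermdi, and, for the 2/4
+   variants, re-order real/imaginary lanes through permute_mask so each output
+   vector holds values in the order expected by the SAVE macros. */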
+.macro GROUP1
+ xxperm vs0, vs32, permute_mask
+ xxperm vs4, vs40, permute_mask
+ xxperm vs1, vs33, permute_mask
+ xxperm vs5, vs41, permute_mask
+ xxperm vs8, vs36, permute_mask
+ xxperm vs12, vs44, permute_mask
+ xxperm vs9, vs37, permute_mask
+ xxperm vs13, vs45, permute_mask
+.endm
+.macro AGG_GROUP1
+ AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
+ AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
+ AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12
+ AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13
+.endm
+.macro GROUP2
+ xxperm vs0, vs34, permute_mask
+ xxperm vs4, vs42, permute_mask
+ xxperm vs1, vs35, permute_mask
+ xxperm vs5, vs43, permute_mask
+ xxperm vs8, vs38, permute_mask
+ xxperm vs12, vs46, permute_mask
+ xxperm vs9, vs39, permute_mask
+ xxperm vs13, vs47, permute_mask
+.endm
+.macro AGG_GROUP2
+ AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4
+ AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5
+ AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12
+ AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13
+.endm
+.macro MULTIPLY_GROUP1
+ MULT_APLHA_PART1 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART1 vs33, vs41, vs2, vs3
+ MULT_APLHA_PART1 vs36, vs44, vs8, vs9
+ MULT_APLHA_PART1 vs37, vs45, vs10, vs11
+ MULT_APLHA_PART2 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART2 vs33, vs41, vs2, vs3
+ MULT_APLHA_PART2 vs36, vs44, vs8, vs9
+ MULT_APLHA_PART2 vs37, vs45, vs10, vs11
+.endm
+.macro MULTIPLY_GROUP2
+ MULT_APLHA_PART1 vs34, vs42, vs4, vs5
+ MULT_APLHA_PART1 vs35, vs43, vs6, vs7
+ MULT_APLHA_PART1 vs38, vs46, vs12, vs13
+ MULT_APLHA_PART1 vs39, vs47, vs14, vs15
+ MULT_APLHA_PART2 vs34, vs42, vs4, vs5
+ MULT_APLHA_PART2 vs35, vs43, vs6, vs7
+ MULT_APLHA_PART2 vs38, vs46, vs12, vs13
+ MULT_APLHA_PART2 vs39, vs47, vs14, vs15
+.endm
+/* reconstruct r, i pairs*/
+.macro RECONSTRUCT_PAIR1
+ xxperm vs0, vs1, save_permute_1
+ xxperm vs2, vs3, save_permute_1
+ xxperm vs8, vs9, save_permute_1
+ xxperm vs10, vs11, save_permute_1
+.endm
+.macro RECONSTRUCT_PAIR2
+ xxperm vs4, vs5, save_permute_1
+ xxperm vs6, vs7, save_permute_1
+ xxperm vs12, vs13, save_permute_1
+ xxperm vs14, vs15, save_permute_1
+.endm
+.macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4
+ xxmfacc \ACC
+ PERMUTE1 \O1, \R3, \R2, \R1, \R0
+ PERMUTE2 \O2, \R1, \R0, \R3, \R2
+ PERMUTE3 \O3, \R1, \R0, \R3, \R2
+ PERMUTE4 \O4, \R3, \R2, \R1, \R0
+.endm
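+/* xxmfacc copies an MMA accumulator back into its four underlying VSRs
+   (R0..R3); the PERMUTE1..4 calls then rearrange those rows into the O1..O4
+   vectors consumed by the SAVE macros. */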
+/* macros for N=4 and M=8
+**********************************************************************************************/
+.macro ZERO4x8
+ xxsetaccz 0
+ xxsetaccz 1
+ xxsetaccz 2
+ xxsetaccz 3
+ xxsetaccz 4
+ xxsetaccz 5
+ xxsetaccz 6
+ xxsetaccz 7
+.endm
+
+.macro LOAD4x8
+ LOAD4x8O 0, 0
+.endm
+
+.macro LOAD4x8O OffsetA, OffsetB
+ lxvp vs34, (\OffsetB+0)(BO)
+ lxvp vs32, (\OffsetA+0)(AO)
+ lxvp vs36, (\OffsetA+32)(AO)
+.endm
+
+.macro END4x8_NORMAL
+ END4x8 AO, BO, 64, 32
+.endm
+
+.macro END4x8_WITHOUT_ADD
+ END4x8 AO, BO, 0, 0
+.endm
+
+.macro END4x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvf32gerpp 3, 36, 35
+ xvf32gerpp 2, 37, 35
+ xvf32gerpp 1, 32, 35
+ xvf32gerpp 0, 33, 35
+ xvf32gerpp 7, 36, 34
+ xvf32gerpp 6, 37, 34
+ xvf32gerpp 5, 32, 34
+ xvf32gerpp 4, 33, 34
+.endm
+
+.macro LOAD4x8_2
+ LOAD4x8_2O 0, 0
+.endm
+
+.macro LOAD4x8_2O OffsetA, OffsetB
+ lxvp vs34, (\OffsetB)(BO)
+ lxvp vs38, (32+\OffsetB)(BO)
+ lxvp vs32, (0+\OffsetA)(AO)
+ lxvp vs36, (32+\OffsetA)(AO)
+ lxvp vs40, (64+\OffsetA)(AO)
+ lxvp vs42, (64+32+\OffsetA)(AO)
+.endm
+
+.macro END4x8_2
+ /*for load2 offset will be 128 and 64*/
+ KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1
+.endm
+
+.macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvf32gerpp 3, 36, 35
+ xvf32gerpp 2, 37, 35
+ xvf32gerpp 1, 32, 35
+ xvf32gerpp 0, 33, 35
+ xvf32gerpp 7, 36, 34
+ xvf32gerpp 6, 37, 34
+ xvf32gerpp 5, 32, 34
+ xvf32gerpp 4, 33, 34
+.if \Complete==0
+ lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
+ lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
+.endif
+ xvf32gerpp 3, 42, 39
+ xvf32gerpp 2, 43, 39
+ xvf32gerpp 1, 40, 39
+ xvf32gerpp 0, 41, 39
+ xvf32gerpp 7, 42, 38
+ xvf32gerpp 6, 43, 38
+ xvf32gerpp 5, 40, 38
+ xvf32gerpp 4, 41, 38
+.if \Complete==0
+ lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
+ lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
+ lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP8(\Index, \OffsetB)
+ addi \AREG, \AREG, DISP16(\Index, \OffsetA)
+.else
+ addi \BREG, \BREG, DISP8(\Index, 64)
+ addi \AREG, \AREG, DISP16(\Index, 128)
+.endif
+.endif
+.endm
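+/* Each xvf32gerpp accumulates the outer product of a 4-float slice of A and a
+   4-float slice of B into one 4x4 MMA accumulator; the loads guarded by
+   Complete==0 fetch the next iteration's A/B data while the current products are
+   issued, and IsLast/Complete decide whether the pointers advance by the unrolled
+   step or by the caller-supplied offsets. */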
+
+.macro KERNEL4x8
+ LOAD4x8
+ END4x8 AO, BO, 64, 32
+.endm
+
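+/* SAVE4x8 unpacks the eight accumulators, combines the real/imaginary partial
+   sums, scales by alpha and, unless TRMMKERNEL is defined (where C is simply
+   overwritten), adds the previously loaded C tile before storing 4 columns of 8
+   complex elements and advancing CO by 64 bytes. */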
+.macro SAVE4x8
+ SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+ SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+ SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
+ SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
+ SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60
+ SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61
+ SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20
+ SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21
+ add T4, LDC, LDC
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxvp vs24, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxvp vs26, 32(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxvp vs28, 0(T1)
+#endif
+ xxperm vs2, vs34, permute_mask
+ xxperm vs6, vs42, permute_mask
+#ifndef TRMMKERNEL
+ lxvp vs30, 32(T1)
+#endif
+ xxperm vs3, vs35, permute_mask
+ xxperm vs7, vs43, permute_mask
+ add T2, CO, T4
+ add T3, T1, T4
+ GROUP1
+ AGG_GROUP1
+ AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6
+ xxperm vs10, vs38, permute_mask
+ xxperm vs14, vs46, permute_mask
+ AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7
+ xxperm vs11, vs39, permute_mask
+ xxperm vs15, vs47, permute_mask
+ xxperm vs0, vs48, permute_mask
+ xxperm vs4, vs56, permute_mask
+ xxperm vs1, vs49, permute_mask
+ xxperm vs5, vs16, permute_mask
+ AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14
+ xxperm vs2, vs50, permute_mask
+ xxperm vs6, vs58, permute_mask
+ AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15
+ xxperm vs3, vs17, permute_mask
+ xxperm vs7, vs19, permute_mask
+ AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4
+ xxperm vs8, vs52, permute_mask
+ xxperm vs12, vs60, permute_mask
+ AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5
+ xxperm vs9, vs53, permute_mask
+ xxperm vs13, vs61, permute_mask
+ AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6
+ xxperm vs10, vs54, permute_mask
+ xxperm vs14, vs21, permute_mask
+ AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7
+ xxperm vs11, vs18, permute_mask
+ xxperm vs15, vs20, permute_mask
+ AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12
+ AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13
+/*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULT_APLHA_PART1 vs32, vs40, vs0, vs1
+ AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14
+ MULT_APLHA_PART1 vs33, vs41, vs2, vs3
+ AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15
+ MULT_APLHA_PART1 vs34, vs42, vs4, vs5
+ MULT_APLHA_PART1 vs35, vs43, vs6, vs7
+ MULT_APLHA_PART2 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART2 vs33, vs41, vs2, vs3
+ MULT_APLHA_PART2 vs34, vs42, vs4, vs5
+ MULT_APLHA_PART2 vs35, vs43, vs6, vs7
+#ifndef TRMMKERNEL
+ lxvp vs32, 0(T2)
+#endif
+ MULT_APLHA_PART1 vs36, vs44, vs8, vs9
+ MULT_APLHA_PART1 vs37, vs45, vs10, vs11
+#ifndef TRMMKERNEL
+ lxvp vs40, 32(T2)
+#endif
+ MULT_APLHA_PART1 vs38, vs46, vs12, vs13
+ MULT_APLHA_PART1 vs39, vs47, vs14, vs15
+#ifndef TRMMKERNEL
+ lxvp vs34, 0(T3)
+#endif
+ MULT_APLHA_PART2 vs36, vs44, vs8, vs9
+ MULT_APLHA_PART2 vs37, vs45, vs10, vs11
+#ifndef TRMMKERNEL
+ lxvp vs42, 32(T3)
+#endif
+ MULT_APLHA_PART2 vs38, vs46, vs12, vs13
+ MULT_APLHA_PART2 vs39, vs47, vs14, vs15
+ RECONSTRUCT_PAIR1
+ RECONSTRUCT_PAIR2
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1, vs8, vs0, 2
+ xxpermdi vs3, vs10, vs2, 2
+ xxpermdi vs5, vs12, vs4, 2
+ xxpermdi vs7, vs14, vs6, 2
+ xxpermdi vs9, vs0, vs8, 2
+ xxpermdi vs11, vs2, vs10, 2
+ xvaddsp vs24, vs24, vs3
+ xvaddsp vs25, vs25, vs1
+ xxpermdi vs13, vs4, vs12, 2
+ xxpermdi vs15, vs6, vs14, 2
+ xvaddsp vs26, vs26, vs7
+ xvaddsp vs27, vs27, vs5
+ xvaddsp vs28, vs28, vs11
+ xvaddsp vs29, vs29, vs9
+ xvaddsp vs30, vs30, vs15
+ xvaddsp vs31, vs31, vs13
+#else
+ xxpermdi vs25, vs8, vs0, 2
+ xxpermdi vs24, vs10, vs2, 2
+ xxpermdi vs27, vs12, vs4, 2
+ xxpermdi vs26, vs14, vs6, 2
+ xxpermdi vs29, vs0, vs8, 2
+ xxpermdi vs28, vs2, vs10, 2
+ xxpermdi vs31, vs4, vs12, 2
+ xxpermdi vs30, vs6, vs14, 2
+#endif
+ stxvp vs24, 0(CO)
+ MULT_APLHA_PART1 vs48, vs56, vs0, vs1
+ MULT_APLHA_PART1 vs49, vs16, vs2, vs3
+ stxvp vs26, 32(CO)
+ MULT_APLHA_PART1 vs50, vs58, vs4, vs5
+ MULT_APLHA_PART1 vs17, vs19, vs6, vs7
+ stxvp vs28, 0(T1)
+ MULT_APLHA_PART2 vs48, vs56, vs0, vs1
+ MULT_APLHA_PART2 vs49, vs16, vs2, vs3
+ stxvp vs30, 32(T1)
+ MULT_APLHA_PART2 vs50, vs58, vs4, vs5
+ MULT_APLHA_PART2 vs17, vs19, vs6, vs7
+ MULT_APLHA_PART1 vs52, vs60, vs8, vs9
+ MULT_APLHA_PART1 vs53, vs61, vs10, vs11
+ MULT_APLHA_PART1 vs54, vs21, vs12, vs13
+ MULT_APLHA_PART1 vs18, vs20, vs14, vs15
+ MULT_APLHA_PART2 vs52, vs60, vs8, vs9
+ MULT_APLHA_PART2 vs53, vs61, vs10, vs11
+ MULT_APLHA_PART2 vs54, vs21, vs12, vs13
+ MULT_APLHA_PART2 vs18, vs20, vs14, vs15
+ RECONSTRUCT_PAIR1
+ RECONSTRUCT_PAIR2
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1, vs8, vs0, 2
+ xxpermdi vs3, vs10, vs2, 2
+ xxpermdi vs5, vs12, vs4, 2
+ xxpermdi vs7, vs14, vs6, 2
+ xxpermdi vs9, vs0, vs8, 2
+ xxpermdi vs11, vs2, vs10, 2
+ xvaddsp vs32, vs32, vs3
+ xvaddsp vs33, vs33, vs1
+ xxpermdi vs13, vs4, vs12, 2
+ xxpermdi vs15, vs6, vs14, 2
+ xvaddsp vs40, vs40, vs7
+ xvaddsp vs41, vs41, vs5
+ xvaddsp vs34, vs34, vs11
+ xvaddsp vs35, vs35, vs9
+ xvaddsp vs42, vs42, vs15
+ xvaddsp vs43, vs43, vs13
+#else
+ xxpermdi vs33, vs8, vs0, 2
+ xxpermdi vs32, vs10, vs2, 2
+ xxpermdi vs41, vs12, vs4, 2
+ xxpermdi vs40, vs14, vs6, 2
+ xxpermdi vs35, vs0, vs8, 2
+ xxpermdi vs34, vs2, vs10, 2
+ xxpermdi vs43, vs4, vs12, 2
+ xxpermdi vs42, vs6, vs14, 2
+#endif
+ stxvp vs32, 0(T2)
+ stxvp vs40, 32(T2)
+ stxvp vs34, 0(T3)
+ stxvp vs42, 32(T3)
+ addi CO, CO, 64
+.endm
+
+/* macros for N=4 and M=4
+**********************************************************************************************/
+
+.macro ZERO4x4
+ xxsetaccz 0
+ xxsetaccz 1
+ xxsetaccz 2
+ xxsetaccz 3
+.endm
+
+.macro LOAD4x4
+ LOAD4x4O 0, 0
+.endm
+
+.macro LOAD4x4O OffsetA, OffsetB
+ lxvp vs34, (\OffsetB+0)(BO)
+ lxvp vs32, (\OffsetA+0)(AO)
+.endm
+
+.macro END4x4_NORMAL
+ END4x4 AO, BO, 32, 32
+.endm
+
+.macro END4x4_WITHOUT_ADD
+ END4x4 AO, BO, 0, 0
+.endm
+
+.macro END4x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvf32gerpp 3, 32, 34
+ xvf32gerpp 2, 33, 34
+ xvf32gerpp 1, 32, 35
+ xvf32gerpp 0, 33, 35
+.endm
+
+.macro LOAD4x4_2
+ LOAD4x4_2O 0, 0
+.endm
+
+.macro LOAD4x4_2O OffsetA, OffsetB
+ lxvp vs34, (\OffsetB)(BO)
+ lxvp vs38, (32+\OffsetB)(BO)
+ lxvp vs32, (0+\OffsetA)(AO)
+ lxvp vs36, (32+\OffsetA)(AO)
+.endm
+
+.macro END4x4_2
+ /*for load2 offset will be 64 and 64*/
+ KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1
+.endm
+
+.macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvf32gerpp 3, 32, 34
+ xvf32gerpp 2, 33, 34
+ xvf32gerpp 1, 32, 35
+ xvf32gerpp 0, 33, 35
+.if \Complete==0
+ lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
+ lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
+.endif
+ xvf32gerpp 3, 36, 38
+ xvf32gerpp 2, 37, 38
+ xvf32gerpp 1, 36, 39
+ xvf32gerpp 0, 37, 39
+.if \Complete==0
+ lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
+ lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP8(\Index, \OffsetB)
+ addi \AREG, \AREG, DISP8(\Index, \OffsetA)
+.else
+ addi \BREG, \BREG, DISP8(\Index, 64)
+ addi \AREG, \AREG, DISP8(\Index, 64)
+.endif
+.endif
+.endm
+
+.macro KERNEL4x4
+ LOAD4x4
+ END4x4 AO, BO, 32, 32
+.endm
+
+.macro SAVE4x4
+ SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+ SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+ SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
+ SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
+ add T4, LDC, LDC
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxvp vs24, 0(CO)
+#endif
+ add T2, CO, T4
+ add T3, T1, T4
+#ifndef TRMMKERNEL
+ lxvp vs26, 0(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxvp vs28, 0(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxvp vs30, 0(T3)
+#endif
+ GROUP1
+ AGG_GROUP1
+ GROUP2
+ AGG_GROUP2
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULTIPLY_GROUP1
+ MULTIPLY_GROUP2
+/* reconstruct r, i pairs*/
+ RECONSTRUCT_PAIR1
+ RECONSTRUCT_PAIR2
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1, vs8, vs0, 2
+ xxpermdi vs3, vs10, vs2, 2
+ xxpermdi vs9, vs0, vs8, 2
+ xxpermdi vs11, vs2, vs10, 2
+ xxpermdi vs5, vs12, vs4, 2
+ xxpermdi vs7, vs14, vs6, 2
+ xxpermdi vs13, vs4, vs12, 2
+ xxpermdi vs15, vs6, vs14, 2
+ xvaddsp vs24, vs24, vs3
+ xvaddsp vs25, vs25, vs1
+ xvaddsp vs26, vs26, vs11
+ xvaddsp vs27, vs27, vs9
+ xvaddsp vs28, vs28, vs7
+ xvaddsp vs29, vs29, vs5
+ xvaddsp vs30, vs30, vs15
+ xvaddsp vs31, vs31, vs13
+#else
+ xxpermdi vs25, vs8, vs0, 2
+ xxpermdi vs24, vs10, vs2, 2
+ xxpermdi vs27, vs0, vs8, 2
+ xxpermdi vs26, vs2, vs10, 2
+ xxpermdi vs29, vs12, vs4, 2
+ xxpermdi vs28, vs14, vs6, 2
+ xxpermdi vs31, vs4, vs12, 2
+ xxpermdi vs30, vs6, vs14, 2
+#endif
+ stxvp vs24, 0(CO)
+ stxvp vs26, 0(T1)
+ stxvp vs28, 0(T2)
+ stxvp vs30, 0(T3)
+ addi CO, CO, 32
+.endm
+
+/* macros for N=4 and M=2
+**********************************************************************************************/
+
+.macro ZERO4x2
+ xxsetaccz 0
+ xxsetaccz 1
+.endm
+
+.macro LOAD4x2
+ LOAD4x2O 0, 0
+.endm
+
+.macro LOAD4x2O OffsetA, OffsetB
+ lxv vs32, (\OffsetA+0)(AO)
+ lxvp vs34, (\OffsetB+0)(BO)
+.endm
+
+.macro END4x2_NORMAL
+ END4x2 AO, BO, 16, 32
+.endm
+
+.macro END4x2_WITHOUT_ADD
+ END4x2 AO, BO, 0, 0
+.endm
+
+.macro END4x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvf32gerpp 1, 34, 32
+ xvf32gerpp 0, 35, 32
+.endm
+
+.macro LOAD4x2_2
+ LOAD4x2_2O 0, 0
+.endm
+
+.macro LOAD4x2_2O OffsetA, OffsetB
+ lxvp vs32, (\OffsetA)(AO)
+ lxvp vs34, (0+\OffsetB)(BO)
+ lxvp vs36, (32+\OffsetB)(BO)
+.endm
+
+.macro END4x2_2
+ /*for load2 offset will be 32 and 64*/
+ KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1
+.endm
+
+.macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvf32gerpp 1, 34, 33
+ xvf32gerpp 0, 35, 33
+.if \Complete==0
+ lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
+.endif
+ xvf32gerpp 1, 36, 32
+ xvf32gerpp 0, 37, 32
+.if \Complete==0
+ lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
+ lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index, \OffsetA)
+ addi \BREG, \BREG, DISP8(\Index, \OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index, 32)
+ addi \BREG, \BREG, DISP8(\Index, 64)
+.endif
+.endif
+.endm
+
+.macro KERNEL4x2
+ LOAD4x2
+ END4x2 AO, BO, 16, 32
+.endm
+
+.macro SAVE4x2
+ SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+ SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+ add T4, LDC, LDC
+ add T1, CO, LDC
+ add T2, CO, T4
+ add T3, T1, T4
+#ifndef TRMMKERNEL
+ lxv vs24, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs25, 0(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs26, 0(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs27, 0(T3)
+#endif
+ GROUP1
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULTIPLY_GROUP1
+/* reconstruct r, i pairs*/
+ RECONSTRUCT_PAIR1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1, vs8, vs0, 0
+ xxpermdi vs9, vs10, vs2, 0
+ xxpermdi vs3, vs0, vs8, 3
+ xxpermdi vs11, vs2, vs10, 3
+ xvaddsp vs24, vs24, vs1
+ xvaddsp vs26, vs26, vs9
+ xvaddsp vs25, vs25, vs3
+ xvaddsp vs27, vs27, vs11
+#else
+ xxpermdi vs24, vs8, vs0, 0
+ xxpermdi vs26, vs10, vs2, 0
+ xxpermdi vs25, vs0, vs8, 3
+ xxpermdi vs27, vs2, vs10, 3
+#endif
+ stxv vs24, 0(CO)
+ stxv vs25, 0(T1)
+ stxv vs26, 0(T2)
+ stxv vs27, 0(T3)
+ addi CO, CO, 16
+.endm
+
+/* macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro ZERO4x1
+ xxsetaccz 0
+ xxsetaccz 1
+.endm
+
+.macro LOAD4x1
+ LOAD4x1O 0, 0
+.endm
+
+.macro LOAD4x1O OffsetA, OffsetB
+ lxsd v0, (\OffsetA+0)(AO)
+ lxvp vs34, (\OffsetB+0)(BO)
+.endm
+
+.macro END4x1_NORMAL
+ END4x1 AO, BO,8, 32
+.endm
+
+.macro END4x1_WITHOUT_ADD
+ END4x1 AO, BO, 0, 0
+.endm
+
+.macro END4x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvf32gerpp 0, 35, 32
+ xvf32gerpp 1, 34, 32
+.endm
+
+.macro LOAD4x1_2
+ LOAD4x1_2O 0, 0
+.endm
+
+.macro LOAD4x1_2O OffsetA, OffsetB
+ lxv vs32, (\OffsetA)(AO)
+ vspltisb v6, 0
+ xxpermdi vs33, vs32, vs38, 0
+ xxpermdi vs32, vs32, vs38, 2
+ lxvp vs34, (0+\OffsetB)(BO)
+ lxvp vs36, (32+\OffsetB)(BO)
+.endm
+
+.macro END4x1_2
+ /*for load2 offset will be 16 and 64*/
+ KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1
+.endm
+
+.macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL4x1_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvf32gerpp 0, 35, 32
+ xvf32gerpp 1, 34, 32
+.if \Complete==0
+ lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
+.endif
+ xvf32gerpp 0, 37, 33
+ xvf32gerpp 1, 36, 33
+.if \Complete==0
+ lxv vs32, DISP2(\Index, \OffsetA)(\AREG)
+ lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
+ xxpermdi vs33, vs32, vs38, 0
+ xxpermdi vs32, vs32, vs38, 2
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index, \OffsetA)
+ addi \BREG, \BREG, DISP8(\Index, \OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index, 16)
+ addi \BREG, \BREG, DISP8(\Index, 64)
+.endif
+.endif
+.endm
+
+.macro KERNEL4x1
+ LOAD4x1
+ END4x1 AO, BO, 8, 32
+.endm
+
+.macro SAVE4x1
+ SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+ SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+ xxpermdi vs32, vs32, vs36, 1
+ xxpermdi vs40, vs40, vs44, 1
+ xxpermdi vs33, vs33, vs37, 1
+ xxpermdi vs41, vs41, vs45, 1
+ add T4, LDC, LDC
+ add T1, CO, LDC
+ add T2, CO, T4
+ add T3, T1, T4
+#ifndef TRMMKERNEL
+ lxsd v4, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v5, 0(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v6, 0(T2)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v7, 0(T3)
+#endif
+ xxperm vs0, vs32, permute_mask
+ xxperm vs4, vs40, permute_mask
+ xxperm vs1, vs33, permute_mask
+ xxperm vs5, vs41, permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULT_APLHA_PART1 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART1 vs33, vs41, vs2, vs3
+ MULT_APLHA_PART2 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART2 vs33, vs41, vs2, vs3
+/* reconstruct r, i pairs*/
+ xxperm vs0, vs1, save_permute_1
+ xxperm vs2, vs3, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxspltd vs1, vs0, 0
+ xxspltd vs3, vs0, 1
+ xxspltd vs9, vs2, 0
+ xxspltd vs11, vs2, 1
+ /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
+ xvaddsp vs36, vs36, vs1
+ xvaddsp vs37, vs37, vs3
+ xvaddsp vs38, vs38, vs9
+ xvaddsp vs39, vs39, vs11
+#else
+ /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
+ xxspltd vs36, vs0, 0
+ xxspltd vs37, vs0, 1
+ xxspltd vs38, vs2, 0
+ xxspltd vs39, vs2, 1
+#endif
+ stxsd v4, 0(CO)
+ stxsd v5, 0(T1)
+ stxsd v6, 0(T2)
+ stxsd v7, 0(T3)
+ addi CO, CO, 8
+.endm
+
+/* macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro ZERO2x8
+ xxsetaccz 0
+ xxsetaccz 1
+ xxsetaccz 2
+ xxsetaccz 3
+.endm
+
+.macro LOAD2x8
+ LOAD2x8O 0, 0
+.endm
+
+.macro LOAD2x8O OffsetA, OffsetB
+ lxv vs34, (\OffsetB+0)(BO)
+ lxvp vs32, (\OffsetA+0)(AO)
+ lxvp vs36, (\OffsetA+32)(AO)
+.endm
+
+.macro END2x8_NORMAL
+ END2x8 AO, BO, 64, 16
+.endm
+
+.macro END2x8_WITHOUT_ADD
+ END2x8 AO, BO, 0, 0
+.endm
+
+.macro END2x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvf32gerpp 2, 37, 34
+ xvf32gerpp 3, 36, 34
+ xvf32gerpp 0, 33, 34
+ xvf32gerpp 1, 32, 34
+.endm
+
+.macro LOAD2x8_2
+ LOAD2x8_2O 0, 0
+.endm
+
+.macro LOAD2x8_2O OffsetA, OffsetB
+ lxvp vs34, (\OffsetB)(BO)
+ lxvp vs32, (0+\OffsetA)(AO)
+ lxvp vs36, (32+\OffsetA)(AO)
+ lxvp vs38, (64+\OffsetA)(AO)
+ lxvp vs40, (64+32+\OffsetA)(AO)
+.endm
+
+.macro END2x8_2
+ /*for load2 offset will be 128 and 32*/
+ KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1
+.endm
+
+.macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvf32gerpp 2, 37, 35
+ xvf32gerpp 3, 36, 35
+ xvf32gerpp 0, 33, 35
+ xvf32gerpp 1, 32, 35
+
+.if \Complete==0
+ lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
+ lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
+.endif
+ xvf32gerpp 2, 41, 34
+ xvf32gerpp 3, 40, 34
+ xvf32gerpp 0, 39, 34
+ xvf32gerpp 1, 38, 34
+
+.if \Complete==0
+ lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
+ lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG)
+ lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP4(\Index, \OffsetB)
+ addi \AREG, \AREG, DISP16(\Index, \OffsetA)
+.else
+ addi \BREG, \BREG, DISP4(\Index, 32)
+ addi \AREG, \AREG, DISP16(\Index, 128)
+.endif
+.endif
+.endm
+
+.macro KERNEL2x8
+ LOAD2x8
+ END2x8 AO, BO, 64, 16
+.endm
+
+.macro SAVE2x8
+ SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+ SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+ SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
+ SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxvp vs24, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxvp vs26, 32(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxvp vs28, 0(T1)
+#endif
+#ifndef TRMMKERNEL
+ lxvp vs30, 32(T1)
+#endif
+ add T2, CO, T4
+ add T3, T1, T4
+ GROUP1
+ AGG_GROUP1
+ GROUP2
+ AGG_GROUP2
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULTIPLY_GROUP1
+ MULTIPLY_GROUP2
+/* reconstruct r, i pairs*/
+ RECONSTRUCT_PAIR1
+ RECONSTRUCT_PAIR2
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1, vs8, vs0, 2
+ xxpermdi vs3, vs10, vs2, 2
+ xxpermdi vs5, vs12, vs4, 2
+ xxpermdi vs7, vs14, vs6, 2
+ xxpermdi vs9, vs0, vs8, 2
+ xxpermdi vs11, vs2, vs10, 2
+ xvaddsp vs24, vs24, vs3
+ xvaddsp vs25, vs25, vs1
+ xxpermdi vs13, vs4, vs12, 2
+ xxpermdi vs15, vs6, vs14, 2
+ xvaddsp vs26, vs26, vs7
+ xvaddsp vs27, vs27, vs5
+ xvaddsp vs28, vs28, vs11
+ xvaddsp vs29, vs29, vs9
+ xvaddsp vs30, vs30, vs15
+ xvaddsp vs31, vs31, vs13
+#else
+ xxpermdi vs25, vs8, vs0, 2
+ xxpermdi vs24, vs10, vs2, 2
+ xxpermdi vs27, vs12, vs4, 2
+ xxpermdi vs26, vs14, vs6, 2
+ xxpermdi vs29, vs0, vs8, 2
+ xxpermdi vs28, vs2, vs10, 2
+ xxpermdi vs31, vs4, vs12, 2
+ xxpermdi vs30, vs6, vs14, 2
+#endif
+ stxvp vs24, 0(CO)
+ stxvp vs26, 32(CO)
+ stxvp vs28, 0(T1)
+ stxvp vs30, 32(T1)
+ addi CO, CO, 64
+.endm
+
+/* macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro ZERO2x4
+ xxsetaccz 0
+ xxsetaccz 1
+.endm
+
+.macro LOAD2x4
+ LOAD2x4O 0, 0
+.endm
+
+.macro LOAD2x4O OffsetA, OffsetB
+ lxv vs34, (\OffsetB+0)(BO)
+ lxvp vs32, (\OffsetA+0)(AO)
+.endm
+
+.macro END2x4_NORMAL
+ END2x4 AO, BO, 32, 16
+.endm
+
+.macro END2x4_WITHOUT_ADD
+ END2x4 AO, BO, 0, 0
+.endm
+
+.macro END2x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvf32gerpp 0, 33, 34
+ xvf32gerpp 1, 32, 34
+.endm
+
+.macro LOAD2x4_2
+ LOAD2x4_2O 0, 0
+.endm
+
+.macro LOAD2x4_2O OffsetA, OffsetB
+ lxvp vs34, (\OffsetB)(BO)
+ lxvp vs32, (0+\OffsetA)(AO)
+ lxvp vs36, (32+\OffsetA)(AO)
+.endm
+
+.macro END2x4_2
+ /*for load2 offset will be 64 and 32*/
+ KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1
+.endm
+
+.macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvf32gerpp 0, 33, 35
+ xvf32gerpp 1, 32, 35
+.if \Complete==0
+ lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
+.endif
+ xvf32gerpp 0, 37, 34
+ xvf32gerpp 1, 36, 34
+.if \Complete==0
+ lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
+ lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP4(\Index, \OffsetB)
+ addi \AREG, \AREG, DISP8(\Index, \OffsetA)
+.else
+ addi \BREG, \BREG, DISP4(\Index, 32)
+ addi \AREG, \AREG, DISP8(\Index, 64)
+.endif
+.endif
+.endm
+
+.macro KERNEL2x4
+ LOAD2x4
+ END2x4 AO, BO, 32, 16
+.endm
+
+.macro SAVE2x4
+ SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+ SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxvp vs24, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxvp vs26, 0(T1)
+#endif
+ GROUP1
+ AGG_GROUP1
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULTIPLY_GROUP1
+/* reconstruct r, i pairs*/
+ RECONSTRUCT_PAIR1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1, vs8, vs0, 2
+ xxpermdi vs3, vs10, vs2, 2
+ xxpermdi vs9, vs0, vs8, 2
+ xxpermdi vs11, vs2, vs10, 2
+ xvaddsp vs24, vs24, vs3
+ xvaddsp vs25, vs25, vs1
+ xvaddsp vs26, vs26, vs11
+ xvaddsp vs27, vs27, vs9
+#else
+ xxpermdi vs25, vs8, vs0, 2
+ xxpermdi vs24, vs10, vs2, 2
+ xxpermdi vs27, vs0, vs8, 2
+ xxpermdi vs26, vs2, vs10, 2
+#endif
+ stxvp vs24, 0(CO)
+ stxvp vs26, 0(T1)
+ addi CO, CO, 32
+.endm
+
+/* macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro ZERO2x2
+ xxsetaccz 0
+.endm
+
+.macro LOAD2x2
+ LOAD2x2O 0, 0
+.endm
+
+.macro LOAD2x2O OffsetA, OffsetB
+ lxv vs32, (\OffsetA+0)(AO)
+ lxv vs34, (\OffsetB+0)(BO)
+.endm
+
+.macro END2x2_NORMAL
+ END2x2 AO, BO, 16, 16
+.endm
+
+.macro END2x2_WITHOUT_ADD
+ END2x2 AO, BO, 0, 0
+.endm
+
+.macro END2x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvf32gerpp 0, 34, 32
+.endm
+
+.macro LOAD2x2_2
+ LOAD2x2_2O 0, 0
+.endm
+
+.macro LOAD2x2_2O OffsetA, OffsetB
+ lxvp vs32, (\OffsetA)(AO)
+ lxvp vs34, (0+\OffsetB)(BO)
+.endm
+
+.macro END2x2_2
+ /*for load2 offset will be 32 and 32*/
+ KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1
+.endm
+
+.macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvf32gerpp 0, 34, 32
+ xvf32gerpp 0, 35, 33
+.if \Complete==0
+ lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
+ lxvp vs34, DISP4(\Index, \OffsetA)(\BREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP4(\Index, \OffsetA)
+ addi \BREG, \BREG, DISP4(\Index, \OffsetB)
+.else
+ addi \AREG, \AREG, DISP4(\Index, 32)
+ addi \BREG, \BREG, DISP4(\Index, 32)
+.endif
+.endif
+.endm
+
+.macro KERNEL2x2
+ LOAD2x2
+ END2x2 AO, BO, 16, 16
+.endm
+
+.macro SAVE2x2
+ SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxv vs24, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxv vs26, 0(T1)
+#endif
+ xxperm vs0, vs32, permute_mask
+ xxperm vs4, vs40, permute_mask
+ xxperm vs8, vs36, permute_mask
+ xxperm vs12, vs44, permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULT_APLHA_PART1 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART1 vs36, vs44, vs8, vs9
+ MULT_APLHA_PART2 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART2 vs36, vs44, vs8, vs9
+/* reconstruct r, i pairs*/
+ xxperm vs0, vs1, save_permute_1
+ xxperm vs8, vs9, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxpermdi vs1, vs8, vs0, 0
+ xxpermdi vs9, vs0, vs8, 3
+ xvaddsp vs24, vs24, vs1
+ xvaddsp vs26, vs26, vs9
+#else
+ xxpermdi vs24, vs8, vs0, 0
+ xxpermdi vs26, vs0, vs8, 3
+#endif
+ stxv vs24, 0(CO)
+ stxv vs26, 0(T1)
+ addi CO, CO, 16
+.endm
+
+/* macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro ZERO2x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs40, vs40, vs40
+.endm
+
+.macro LOAD2x1
+ LOAD2x1O 0, 0
+.endm
+
+.macro LOAD2x1O OffsetA, OffsetB
+ lxsd v4, (\OffsetA+0)(AO)
+ lxv vs0, (\OffsetB+0)(BO)
+ xxspltd vs24, vs36, 0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+.macro END2x1_NORMAL
+ END2x1 AO, BO,8, 16
+.endm
+
+.macro END2x1_WITHOUT_ADD
+ END2x1 AO, BO, 0, 0
+.endm
+
+.macro END2x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddasp vs32, vs0, vs24
+ xvmaddasp vs40, vs0, vs26
+.endm
+
+.macro LOAD2x1_2
+ LOAD2x1_2O 0, 0
+.endm
+
+.macro LOAD2x1_2O OffsetA, OffsetB
+ lxv vs27, (\OffsetA)(AO)
+ lxvp vs4, (0+\OffsetB)(BO)
+ xxspltd vs8, vs27, 1
+ xxspltd vs24, vs27, 0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+.endm
+
+.macro END2x1_2
+ /*for load2 offset will be 16 and 32*/
+ KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1
+.endm
+
+.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvmaddasp vs32, vs5, vs8
+ xvmaddasp vs40, vs5, vs10
+.if \Complete==0
+ lxv vs27, DISP2(\Index, \OffsetA)(\AREG)
+ xxspltd vs8, vs27, 1
+.endif
+.if \Complete==0
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs4, vs24
+ xvmaddasp vs40, vs4, vs26
+.if \Complete==0
+ xxspltd vs24, vs27, 0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \Complete==0
+ lxvp vs4, DISP4(\Index, 0+\OffsetB)(\BREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \AREG, \AREG, DISP2(\Index, \OffsetA)
+ addi \BREG, \BREG, DISP4(\Index, \OffsetB)
+.else
+ addi \AREG, \AREG, DISP2(\Index, 16)
+ addi \BREG, \BREG, DISP4(\Index, 32)
+.endif
+.endif
+.endm
+
+.macro KERNEL2x1
+ LOAD2x1
+ END2x1 AO, BO, 8, 16
+.endm
+
+.macro SAVE2x1
+ add T1, CO, LDC
+#ifndef TRMMKERNEL
+ lxsd v4, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+ lxsd v5, 0(T1)
+#endif
+ xxperm vs0, vs32, permute_mask
+ xxperm vs4, vs40, permute_mask
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULT_APLHA_PART1 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART2 vs32, vs40, vs0, vs1
+/* reconstruct r, i pairs*/
+ xxperm vs0, vs1, save_permute_1
+#ifndef TRMMKERNEL
+ /* add */
+ xxspltd vs1, vs0, 0
+ xxspltd vs3, vs0, 1
+ /*--v4==vs36 v5==vs37---*/
+ xvaddsp vs36, vs36, vs1
+ xvaddsp vs37, vs37, vs3
+#else
+ /*--v4==vs36 v5==vs37---*/
+ xxspltd vs36, vs0, 0
+ xxspltd vs37, vs0, 1
+#endif
+ stxsd v4, 0(CO)
+ stxsd v5, 0(T1)
+ addi CO, CO, 8
+.endm
+
+/* macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro ZERO1x8
+ xxsetaccz 0
+ xxsetaccz 1
+ xxsetaccz 2
+ xxsetaccz 3
+.endm
+
+.macro LOAD1x8
+ LOAD1x8O 0, 0
+.endm
+
+.macro LOAD1x8O OffsetA, OffsetB
+ lxsd v2, (\OffsetB+0)(BO)
+ lxvp vs32, (\OffsetA+0)(AO)
+ lxvp vs36, (\OffsetA+32)(AO)
+.endm
+
+.macro END1x8_NORMAL
+ END1x8 AO, BO, 64,8
+.endm
+
+.macro END1x8_WITHOUT_ADD
+ END1x8 AO, BO, 0, 0
+.endm
+
+.macro END1x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvf32gerpp 0, 34, 33
+ xvf32gerpp 1, 34, 32
+ xvf32gerpp 2, 34, 37
+ xvf32gerpp 3, 34, 36
+.endm
+
+.macro LOAD1x8_2
+ LOAD1x8_2O 0, 0
+.endm
+
+.macro LOAD1x8_2O OffsetA, OffsetB
+ lxv vs34, (\OffsetB)(BO)
+ lxvp vs32, (0+\OffsetA)(AO)
+ lxvp vs36, (32+\OffsetA)(AO)
+ vspltisb v10, 0
+ xxpermdi vs35, vs34, vs42, 0
+ xxpermdi vs34, vs34, vs42, 2
+ lxvp vs38, (64+\OffsetA)(AO)
+ lxvp vs40, (64+32+\OffsetA)(AO)
+.endm
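+/* For the N=1 kernels only one B column is live, so the 16-byte B vector holding
+   two k-iterations is split across vs34/vs35 by xxpermdi against a zeroed
+   register (v10 == vs42); each half is padded with zeros so the same xvf32gerpp
+   form can be reused. */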
+
+.macro END1x8_2
+ /*for load2 offset will be 128 and 16*/
+ KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1
+.endm
+
+.macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvf32gerpp 0, 34, 33
+ xvf32gerpp 1, 34, 32
+.if \Complete==0
+ lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
+.endif
+ xvf32gerpp 2, 34, 37
+ xvf32gerpp 3, 34, 36
+.if \Complete==0
+ lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
+.endif
+ xvf32gerpp 0, 35, 39
+ xvf32gerpp 1, 35, 38
+.if \Complete==0
+ lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG)
+.endif
+ xvf32gerpp 2, 35, 41
+ xvf32gerpp 3, 35, 40
+.if \Complete==0
+ lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
+ xxpermdi vs35, vs34, vs42, 0
+ xxpermdi vs34, vs34, vs42, 2
+ lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index, \OffsetB)
+ addi \AREG, \AREG, DISP16(\Index, \OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index, 16)
+ addi \AREG, \AREG, DISP16(\Index, 128)
+.endif
+.endif
+.endm
+
+.macro KERNEL1x8
+ LOAD1x8
+ END1x8 AO, BO, 64,8
+.endm
+
+.macro SAVE1x8
+ SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+ SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+ SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
+ SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
+ xxpermdi vs32, vs32, vs36, 0
+ xxpermdi vs33, vs33, vs37, 0
+ xxpermdi vs34, vs34, vs38, 0
+ xxpermdi vs35, vs35, vs39, 0
+ xxpermdi vs40, vs40, vs44, 0
+ xxperm vs40, vs40, permute_mask
+ xxpermdi vs41, vs41, vs45, 0
+ xxperm vs41, vs41, permute_mask
+ xxpermdi vs42, vs42, vs46, 0
+ xxperm vs42, vs42, permute_mask
+ xxpermdi vs43, vs43, vs47, 0
+ xxperm vs43, vs43, permute_mask
+#ifndef TRMMKERNEL
+ lxvp vs24, 0(CO)
+#endif
+ xxperm vs0, vs32, permute_mask
+ xxperm vs4, vs40, permute_mask
+#ifndef TRMMKERNEL
+ lxvp vs26, 32(CO)
+#endif
+ xxperm vs1, vs33, permute_mask
+ xxperm vs5, vs41, permute_mask
+ xxperm vs2, vs34, permute_mask
+ xxperm vs6, vs42, permute_mask
+ xxperm vs3, vs35, permute_mask
+ xxperm vs7, vs43, permute_mask
+ AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
+ AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
+ AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6
+ AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7
+ /*inner reverse save_permute and store vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1, 2
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULT_APLHA_PART1 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART1 vs33, vs41, vs2, vs3
+ MULT_APLHA_PART1 vs34, vs42, vs4, vs5
+ MULT_APLHA_PART1 vs35, vs43, vs6, vs7
+ MULT_APLHA_PART2 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART2 vs33, vs41, vs2, vs3
+ MULT_APLHA_PART2 vs34, vs42, vs4, vs5
+ MULT_APLHA_PART2 vs35, vs43, vs6, vs7
+/* reconstruct r, i pairs*/
+ xxperm vs0, vs1, vs28
+ xxperm vs2, vs3, vs28
+ xxperm vs4, vs5, vs28
+ xxperm vs6, vs7, vs28
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs24, vs24, vs2
+ xvaddsp vs25, vs25, vs0
+ xvaddsp vs26, vs26, vs6
+ xvaddsp vs27, vs27, vs4
+ stxvp vs24, 0(CO)
+ stxvp vs26, 32(CO)
+#else
+/* reconstruct r, i pairs*/
+ stxv vs0, 0(CO)
+ stxv vs2, 16(CO)
+ stxv vs4, 32(CO)
+ stxv vs6, 48(CO)
+#endif
+ addi CO, CO, 64
+.endm
+
+/* macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro ZERO1x4
+ xxsetaccz 0
+ xxsetaccz 1
+.endm
+
+.macro LOAD1x4
+ LOAD1x4O 0, 0
+.endm
+
+.macro LOAD1x4O OffsetA, OffsetB
+ lxsd v2, (\OffsetB+0)(BO)
+ lxvp vs32, (\OffsetA+0)(AO)
+.endm
+
+.macro END1x4_NORMAL
+ END1x4 AO, BO, 32,8
+.endm
+
+.macro END1x4_WITHOUT_ADD
+ END1x4 AO, BO, 0, 0
+.endm
+
+.macro END1x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvf32gerpp 0, 34, 33
+ xvf32gerpp 1, 34, 32
+.endm
+
+.macro LOAD1x4_2
+ LOAD1x4_2O 0, 0
+.endm
+
+.macro LOAD1x4_2O OffsetA, OffsetB
+ lxv vs34, (\OffsetB)(BO)
+ lxvp vs32, (0+\OffsetA)(AO)
+ vspltisb v6, 0
+ xxpermdi vs35, vs34, vs38, 0
+ xxpermdi vs34, vs34, vs38, 2
+ lxvp vs36, (32+\OffsetA)(AO)
+.endm
+
+.macro END1x4_2
+ /*for load2 offset will be 64 and 16*/
+ KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1
+.endm
+
+.macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvf32gerpp 0, 34, 33
+ xvf32gerpp 1, 34, 32
+.if \Complete==0
+ lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
+.endif
+ xvf32gerpp 0, 35, 37
+ xvf32gerpp 1, 35, 36
+.if \Complete==0
+ lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
+ xxpermdi vs35, vs34, vs38, 0
+ xxpermdi vs34, vs34, vs38, 2
+ lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index, \OffsetB)
+ addi \AREG, \AREG, DISP8(\Index, \OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index, 16)
+ addi \AREG, \AREG, DISP8(\Index, 64)
+.endif
+.endif
+.endm
+
+.macro KERNEL1x4
+ LOAD1x4
+ END1x4 AO, BO, 32,8
+.endm
+
+.macro SAVE1x4
+ SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+ SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+ xxpermdi vs32, vs32, vs36, 0
+ xxpermdi vs40, vs40, vs44, 0
+ xxpermdi vs33, vs33, vs37, 0
+ xxpermdi vs41, vs41, vs45, 0
+ xxperm vs40, vs40, permute_mask
+ xxperm vs41, vs41, permute_mask
+#ifndef TRMMKERNEL
+ lxvp vs24, 0(CO)
+#endif
+ xxperm vs0, vs32, permute_mask
+ xxperm vs4, vs40, permute_mask
+ xxperm vs1, vs33, permute_mask
+ xxperm vs5, vs41, permute_mask
+ AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
+ AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
+ /*inner reverse save_permute and store vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1, 2
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULT_APLHA_PART1 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART1 vs33, vs41, vs2, vs3
+ MULT_APLHA_PART2 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART2 vs33, vs41, vs2, vs3
+/* reconstruct r, i pairs*/
+ xxperm vs0, vs1, vs28
+ xxperm vs2, vs3, vs28
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs24, vs24, vs2
+ xvaddsp vs25, vs25, vs0
+ stxvp vs24, 0(CO)
+#else
+/* reconstruct r, i pairs*/
+ stxv vs0, 0(CO)
+ stxv vs2, 16(CO)
+#endif
+ addi CO, CO, 32
+.endm
+
+/* macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro ZERO1x2
+ xxlxor vs32, vs32, vs32
+ xxlxor vs40, vs40, vs40
+.endm
+
+.macro LOAD1x2
+ LOAD1x2O 0, 0
+.endm
+
+.macro LOAD1x2O OffsetA, OffsetB
+ lxsd v4, (\OffsetB+0)(BO)
+ lxv vs0, (\OffsetA+0)(AO)
+ xxspltd vs24, vs36, 0
+ xxperm vs26, vs24, permute_mask
+.endm
+
+.macro END1x2_NORMAL
+ END1x2 AO, BO, 16,8
+.endm
+
+.macro END1x2_WITHOUT_ADD
+ END1x2 AO, BO, 0, 0
+.endm
+
+.macro END1x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddasp vs32, vs0, vs24
+ xvmaddasp vs40, vs0, vs26
+.endm
+
+.macro LOAD1x2_2
+ LOAD1x2_2O 0, 0
+.endm
+
+.macro LOAD1x2_2O OffsetA, OffsetB
+ lxv vs27, (\OffsetB)(BO)
+ lxvp vs4, (0+\OffsetA)(AO)
+ xxspltd vs8, vs27, 1
+ xxspltd vs24, vs27, 0
+ xxperm vs10, vs8, permute_mask
+ xxperm vs26, vs24, permute_mask
+.endm
+
+.macro END1x2_2
+ /*for load2 offset will be 32 and 16*/
+ KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1
+.endm
+
+.macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+.if \Complete==0
+ lxv vs27, DISP2(\Index, \OffsetB)(\BREG)
+.endif
+ xvmaddasp vs32, vs5, vs8
+ xvmaddasp vs40, vs5, vs10
+
+.if \Complete==0
+ xxspltd vs8, vs27, 1
+ xxperm vs10, vs8, permute_mask
+.endif
+ xvmaddasp vs32, vs4, vs24
+ xvmaddasp vs40, vs4, vs26
+.if \Complete==0
+ lxvp vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+ xxspltd vs24, vs27, 0
+ xxperm vs26, vs24, permute_mask
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index, \OffsetB)
+ addi \AREG, \AREG, DISP4(\Index, \OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index, 16)
+ addi \AREG, \AREG, DISP4(\Index, 32)
+.endif
+.endif
+.endm
+
+.macro KERNEL1x2
+ LOAD1x2
+ END1x2 AO, BO, 16,8
+.endm
+
+.macro SAVE1x2
+#ifndef TRMMKERNEL
+ lxv vs24, 0(CO)
+#endif
+ xxperm vs0, vs32, permute_mask
+ xxperm vs4, vs40, permute_mask
+ AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
+ /*inner reverse save_permute and store vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1, 2
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULT_APLHA_PART1 vs32, vs40, vs0, vs1
+ MULT_APLHA_PART2 vs32, vs40, vs0, vs1
+/* reconstruct r, i pairs*/
+ xxperm vs0, vs1, vs28
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs24, vs24, vs0
+ stxv vs24, 0(CO)
+#else
+/* reconstruct r, i pairs*/
+ stxv vs0, 0(CO)
+#endif
+ addi CO, CO, 16
+.endm
+
+/* macros for N=1 and M=1
+**********************************************************************************************/
+.macro ZERO1x1
+ xxlxor vs32, vs32, vs32
+ xxlxor vs40, vs40, vs40
+.endm
+
+.macro LOAD1x1
+ LOAD1x1O 0, 0
+.endm
+
+.macro LOAD1x1O OffsetA, OffsetB
+ lxsd v4, (\OffsetB+0)(BO)
+ lxsd v5, (\OffsetA+0)(AO)
+ xxperm vs38, vs36, permute_mask
+.endm
+
+.macro END1x1_NORMAL
+ END1x1 AO, BO,8,8
+.endm
+
+.macro END1x1_WITHOUT_ADD
+ END1x1 AO, BO, 0, 0
+.endm
+
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+ xvmaddasp vs32, vs37, vs36
+ xvmaddasp vs40, vs37, vs38
+.endm
+
+.macro LOAD1x1_2
+ LOAD1x1_2O 0, 0
+.endm
+
+.macro LOAD1x1_2O OffsetA, OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+.endm
+
+.macro END1x1_2
+ /*for load2 offset will be 16 and 16*/
+ KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1
+.endm
+
+.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast
+ KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast
+ KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+.macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+ xvmaddasp vs32, vs4, vs8
+ xvmaddasp vs40, vs4, vs10
+.if \Complete==0
+ lxv vs8, DISP2(\Index, \OffsetB)(\BREG)
+ lxv vs4, DISP2(\Index, \OffsetA)(\AREG)
+ xxperm vs10, vs8, permute_mask
+.endif
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index, \OffsetB)
+ addi \AREG, \AREG, DISP2(\Index, \OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index, 16)
+ addi \AREG, \AREG, DISP2(\Index, 16)
+.endif
+.endif
+.endm
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 8,8
+.endm
+
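+/* The 1x1 kernel appears to keep two k-iterations in the two doubleword halves
+   of vs32/vs40; SAVE1x1 first folds the halves together (the "aggregate x2"
+   xxpermdi/xvaddsp pair) before the usual alpha scaling and optional addition of
+   the existing C element. */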
+.macro SAVE1x1
+#ifndef TRMMKERNEL
+ lxsd v4, 0(CO)
+#endif
+ /*aggregate x2*/
+ xxpermdi vs33, vs32, vs32, 2
+ xxpermdi vs41, vs40, vs40, 2
+ xvaddsp vs32, vs32, vs33
+ xvaddsp vs40, vs40, vs41
+
+ xxperm vs0, vs32, permute_mask
+ xxperm vs4, vs40, permute_mask
+ AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
+ /*inner reverse save_permute and store vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1, 2
+ /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+ MULT_APLHA_PART1 vs32, vs40, vs37, vs1
+ MULT_APLHA_PART2 vs32, vs40, vs37, vs1
+/* reconstruct r, i pairs*/
+ xxperm vs37, vs1, vs28
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs36, vs36, vs37
+ stxsd v4, 0(CO)
+#else
+/* vs37 is v5 */
+ stxsd v5, 0(CO)
+#endif
+ addi CO, CO, 8
+.endm
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+.if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 7
+.elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 6
+.elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 5
+.elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 4
+.elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 3
+.endif
+.endm
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*8;
+// ptrbb = bb + off*4;
+// #endif
+*/
+.macro REFRESH_POINTERS PTR_A,PTR_B, OFF_VAL, B_VAL, C_A, C_B
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+/* ptrbb = bb;*/
+ mr \PTR_B, \B_VAL /* refresh BPOINT */
+#else
+/*
+// ptrba =ptrba+ off*C_A;
+// ptrbb = bb + off*C_B;
+*/
+ SHIFT_REG T4, \OFF_VAL, \C_B /* byte offset of off*C_B B elements */
+ SHIFT_REG T2, \OFF_VAL, \C_A /* byte offset of off*C_A A elements */
+ add \PTR_B, \B_VAL, T4 /* BO = B + off*C_B */
+ add \PTR_A, \PTR_A, T2 /* AO += off*C_A */
+#endif
+.endm
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+8; // number of values in A
+// #else
+// temp = off+4; // number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK, \BK_VAL, \OFF_VAL
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B // number of values in B*/
+ addi \TEMP_BK, \OFF_VAL, \INCR_B
+ #endif
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= 8; // number of values in A
+// #else
+// temp -= 4; // number of values in B
+// #endif
+// ptrba += temp*8;
+// ptrbb += temp*4;
+// #endif
+
+// #ifdef LEFT
+// off += 8; // number of values in A
+// #endif
+*/
+.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL,PTR_B,PTR_A, C_A, C_B
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /*temp = bk - off;*/
+ sub \TEMP_BK, \BK_VAL, \OFF_VAL
+ #ifdef LEFT
+ /*temp -= 8; // number of values in A*/
+ addi \TEMP_BK, \TEMP_BK,-\C_A
+ #else
+ /*temp -= 4; // number of values in B*/
+ addi \TEMP_BK, \TEMP_BK,-\C_B
+ #endif
+ /*ptrba += temp*C_A;
+ ptrbb += temp*C_B;*/
+ SHIFT_REG T4, \TEMP_BK, \C_A
+ SHIFT_REG T2, \TEMP_BK, \C_B
+ add \PTR_A, \PTR_A, T4/*ptrba+temp*C_A*/
+ add \PTR_B, \PTR_B, T2
+ #endif
+ #ifdef LEFT
+ /*off += 8; // number of values in A*/
+ addi \OFF_VAL, \OFF_VAL, \C_A
+ #endif
+.endm
--- /dev/null
+/*********************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
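+/* vec_t is a raw 16-byte vector handed to the MMA builtins; v4sf_t / v2sf_t are
+   16- and 8-byte vectors of FLOAT used by the scalar/remainder paths. */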
+
+#ifdef TRMMKERNEL
+#define SAVE_ACC(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[0* ldc+J]; \
+ rowC[0] = result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[1*ldc+J]; \
+ rowC[0] = result[2] * alpha; \
+ rowC = (v4sf_t *) &CO[2*ldc+J]; \
+ rowC[0] = result[1] * alpha; \
+ rowC = (v4sf_t *) &CO[3*ldc+J]; \
+ rowC[0] = result[0] * alpha;
+#define SAVE_ACC1(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[4* ldc+J]; \
+ rowC[0] = result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[5*ldc+J]; \
+ rowC[0] = result[2] * alpha; \
+ rowC = (v4sf_t *) &CO[6*ldc+J]; \
+ rowC[0] = result[1] * alpha; \
+ rowC = (v4sf_t *) &CO[7*ldc+J]; \
+ rowC[0] = result[0] * alpha;
+#define SAVE2x4_ACC(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[0* ldc+J]; \
+ rowC[0] = result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[1* ldc+J]; \
+ rowC[0] = result[2] * alpha;
+#else
+#define SAVE_ACC(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[0* ldc+J]; \
+ rowC[0] += result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[1*ldc+J]; \
+ rowC[0] += result[2] * alpha; \
+ rowC = (v4sf_t *) &CO[2*ldc+J]; \
+ rowC[0] += result[1] * alpha; \
+ rowC = (v4sf_t *) &CO[3*ldc+J]; \
+ rowC[0] += result[0] * alpha;
+#define SAVE_ACC1(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[4* ldc+J]; \
+ rowC[0] += result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[5*ldc+J]; \
+ rowC[0] += result[2] * alpha; \
+ rowC = (v4sf_t *) &CO[6*ldc+J]; \
+ rowC[0] += result[1] * alpha; \
+ rowC = (v4sf_t *) &CO[7*ldc+J]; \
+ rowC[0] += result[0] * alpha;
+#define SAVE2x4_ACC(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[0* ldc+J]; \
+ rowC[0] += result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[1* ldc+J]; \
+ rowC[0] += result[2] * alpha;
+#endif
+
+#define SET_ACC_ZERO4() \
+ __builtin_mma_xxsetaccz (&acc0); \
+ __builtin_mma_xxsetaccz (&acc1); \
+ __builtin_mma_xxsetaccz (&acc2); \
+ __builtin_mma_xxsetaccz (&acc3);
+
+#define SET_ACC_ZERO8() \
+ __builtin_mma_xxsetaccz (&acc0); \
+ __builtin_mma_xxsetaccz (&acc1); \
+ __builtin_mma_xxsetaccz (&acc2); \
+ __builtin_mma_xxsetaccz (&acc3); \
+ __builtin_mma_xxsetaccz (&acc4); \
+ __builtin_mma_xxsetaccz (&acc5); \
+ __builtin_mma_xxsetaccz (&acc6); \
+ __builtin_mma_xxsetaccz (&acc7);
+
+#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+#define REFRESH_TEMP_BK(x, y) \
+ temp = k - off;
+#elif defined(LEFT)
+#define REFRESH_TEMP_BK(x, y) \
+ temp = off + x;
+#else
+#define REFRESH_TEMP_BK(x, y) \
+ temp = off + y;
+#endif
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_POINTERS(x, y) \
+ BO = B; \
+ REFRESH_TEMP_BK(x, y)
+#else
+#define REFRESH_POINTERS(x, y) \
+ AO += off * x; \
+ BO = B + off * y; \
+ REFRESH_TEMP_BK(x, y)
+#endif
+
+#ifdef LEFT
+#define REFRESH_OFF(x) \
+ off += x;
+#else
+#define REFRESH_OFF(x)
+#endif
+
+#ifdef LEFT
+#define UPDATE_TEMP(x, y) \
+ temp -= x;
+#else
+#define UPDATE_TEMP(x, y) \
+ temp -= y;
+#endif
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_TMP_AFTER_SAVE(x, y) \
+ temp = k - off; \
+ UPDATE_TEMP(x, y) \
+ AO += temp * x; \
+ BO += temp * y;
+#else
+#define REFRESH_TMP_AFTER_SAVE(x, y)
+#endif
+
+#define REFRESH_AFTER_SAVE(x,y) \
+ REFRESH_TMP_AFTER_SAVE(x, y) \
+ REFRESH_OFF(x)
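+/* TRMM bookkeeping: REFRESH_POINTERS(x, y) advances AO/BO to the triangular offset
+   (x = A values per tile, y = B values per tile) and sets temp, the k-loop trip count;
+   REFRESH_AFTER_SAVE(x, y) skips the untouched part of A/B and, for LEFT, bumps off. */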
+/*************************************************************************************
+* GEMM Kernel
+*************************************************************************************/
+int
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
+ FLOAT * C, BLASLONG ldc
+#ifdef TRMMKERNEL
+ , BLASLONG offset
+#endif
+ )
+{
+ BLASLONG N = n;
+ BLASLONG i1;
+#if defined(TRMMKERNEL)
+ BLASLONG off;
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off = -offset;
+#endif
+ v4sf_t valpha = { alpha, alpha };
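+ /* 16-byte vector of FLOAT: two lanes here, both set to alpha for the scalar tail paths */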
+ N = n >> 2;
+ for (i1 = 0; i1 < N; i1++)
+ {
+ BLASLONG i, j, temp;
+ FLOAT *CO;
+ FLOAT *AO;
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+ CO = C;
+ C += ldc << 2;
+ AO = A;
+ PREFETCH1 (A, 128);
+ PREFETCH1 (A, 256);
+ i = m >> 4;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (16, 4);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ BLASLONG l = 0;
+ PREFETCH1 (CO, 0);
+ PREFETCH1 (CO + ldc, 0);
+ PREFETCH1 (CO + ldc + ldc, 0);
+ PREFETCH1 (CO + ldc + ldc + ldc, 0);
+ PREFETCH1 (CO, 128);
+ PREFETCH1 (CO + ldc, 128);
+ PREFETCH1 (CO + ldc + ldc, 128);
+ PREFETCH1 (CO + ldc + ldc + ldc, 128);
+ __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+ SET_ACC_ZERO8 ();
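+ /* 16x4 tile: each k step consumes 16 packed A values and 4 B values; every
+    xvf64gerpp adds a 4x2 outer product, so the eight accumulators cover the tile. */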
+ for (l = 0; l < temp; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[l << 4];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[l << 2];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+ __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
+ __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
+ __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
+ __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC (&acc2, 4);
+ SAVE_ACC (&acc1, 2);
+ SAVE_ACC (&acc3, 6);
+ SAVE_ACC (&acc4, 8);
+ SAVE_ACC (&acc6, 12);
+ SAVE_ACC (&acc5, 10);
+ SAVE_ACC (&acc7, 14);
+ AO += temp << 4;
+ BO += temp << 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (16, 4)
+#endif
+ CO += 16;
+ }
+ i = (m & 15) >> 3;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (8, 4);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1, acc2, acc3;
+ SET_ACC_ZERO4 ();
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[l << 3];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[l << 2];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC (&acc2, 4);
+ SAVE_ACC (&acc1, 2);
+ SAVE_ACC (&acc3, 6);
+ CO += 8;
+ AO += temp << 3;
+ BO += temp << 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (8, 4)
+#endif
+ }
+ i = (m & 7) >> 2;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (4, 4);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1;
+ __builtin_mma_xxsetaccz (&acc0);
+ __builtin_mma_xxsetaccz (&acc1);
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[l << 2];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[l << 2];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC (&acc1, 2);
+ CO += 4;
+ AO += temp << 2;
+ BO += temp << 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (4, 4)
+#endif
+ }
+ i = (m & 3) >> 1;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (2, 4);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0;
+ __builtin_mma_xxsetaccz (&acc0);
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[l << 1];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[l << 2];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ }
+ SAVE_ACC (&acc0, 0);
+ CO += 2;
+ AO += temp << 1;
+ BO += temp << 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (2, 4)
+#endif
+ }
+ i = (m & 1) >> 0;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (1, 4);
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0 };
+ v4sf_t t1 = { 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowA = { AO[l], AO[l] };
+ v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] };
+ v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] };
+ t += rowA * rowB;
+ t1 += rowA * rowB1;
+ }
+ t = t * valpha;
+ t1 = t1 * valpha;
+#if defined(TRMMKERNEL)
+ CO[0 * ldc] = t[0];
+ CO[1 * ldc] = t[1];
+ CO[2 * ldc] = t1[0];
+ CO[3 * ldc] = t1[1];
+#else
+ CO[0 * ldc] += t[0];
+ CO[1 * ldc] += t[1];
+ CO[2 * ldc] += t1[0];
+ CO[3 * ldc] += t1[1];
+#endif
+ CO += 1;
+ AO += temp;
+ BO += temp << 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (1, 4)
+#endif
+ }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4; // number of values in B
+#endif
+ B += k << 2;
+ }
+ N = (n & 3) >> 1;
+ for (i1 = 0; i1 < N; i1++)
+ {
+ BLASLONG i, j, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+ FLOAT *CO;
+ FLOAT *AO;
+ CO = C;
+ C += ldc << 1;
+ AO = A;
+ i = m >> 4;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (16, 2);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+ SET_ACC_ZERO8 ();
+ BLASLONG l = 0;
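+ /* n=2 panels carry only two B values per k step; they are staged in a zero-padded
+    buffer so a full __vector_pair can still be assembled for the MMA update. */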
+ for (l = 0; l < temp; l++)
+ {
+ FLOAT t[4] = { 0, 0, 0, 0 };
+ t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & t[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rowA = (vec_t *) & AO[l << 4];
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+ __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
+ __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
+ __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
+ __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+ }
+ SAVE2x4_ACC (&acc0, 0);
+ SAVE2x4_ACC (&acc1, 2);
+ SAVE2x4_ACC (&acc2, 4);
+ SAVE2x4_ACC (&acc3, 6);
+ SAVE2x4_ACC (&acc4, 8);
+ SAVE2x4_ACC (&acc5, 10);
+ SAVE2x4_ACC (&acc6, 12);
+ SAVE2x4_ACC (&acc7, 14);
+ CO += 16;
+ AO += temp << 4;
+ BO += temp << 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (16, 2)
+#endif
+ }
+ i = (m & 15) >> 3;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (8, 2);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1, acc2, acc3;
+ SET_ACC_ZERO4 ();
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ FLOAT t[4] = { 0, 0, 0, 0 };
+ t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & t[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rowA = (vec_t *) & AO[l << 3];
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+ }
+ SAVE2x4_ACC (&acc0, 0);
+ SAVE2x4_ACC (&acc1, 2);
+ SAVE2x4_ACC (&acc2, 4);
+ SAVE2x4_ACC (&acc3, 6);
+ CO += 8;
+ AO += temp << 3;
+ BO += temp << 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (8, 2)
+#endif
+ }
+ i = (m & 7) >> 2;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (4, 2);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1;
+ __builtin_mma_xxsetaccz (&acc0);
+ __builtin_mma_xxsetaccz (&acc1);
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ FLOAT t[4] = { 0, 0, 0, 0 };
+ t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & t[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rowA = (vec_t *) & AO[l << 2];
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+ }
+ SAVE2x4_ACC (&acc0, 0);
+ SAVE2x4_ACC (&acc1, 2);
+ CO += 4;
+ AO += temp << 2;
+ BO += temp << 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (4, 2)
+#endif
+ }
+ i = (m & 3) >> 1;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (2, 2);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0;
+ __builtin_mma_xxsetaccz (&acc0);
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ FLOAT t[4] = { 0, 0, 0, 0 };
+ t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & t[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rowA = (vec_t *) & AO[l << 1];
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ }
+ SAVE2x4_ACC (&acc0, 0);
+ CO += 2;
+ AO += temp << 1;
+ BO += temp << 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (2, 2)
+#endif
+ }
+ i = (m & 1) >> 0;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (1, 2);
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowA = { AO[l], AO[l] };
+ v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] };
+ t += rowA * rowB;
+ }
+ t = t * valpha;
+#if defined(TRMMKERNEL)
+ CO[0 * ldc] = t[0];
+ CO[1 * ldc] = t[1];
+#else
+ CO[0 * ldc] += t[0];
+ CO[1 * ldc] += t[1];
+#endif
+ CO += 1;
+ AO += temp;
+ BO += temp << 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (1, 2)
+#endif
+ }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2; // number of values in B
+#endif
+ B += k << 1;
+ }
+ N = (n & 1) >> 0;
+ for (i1 = 0; i1 < N; i1++)
+ {
+ BLASLONG i, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+ FLOAT *CO;
+ FLOAT *AO;
+ CO = C;
+ C += ldc;
+ AO = A;
+ i = m;
+ while (i >= 16)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (16, 1)
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0 };
+ v4sf_t t1 = { 0, 0 };
+ v4sf_t t2 = { 0, 0 };
+ v4sf_t t3 = { 0, 0 };
+ v4sf_t t4 = { 0, 0 };
+ v4sf_t t5 = { 0, 0 };
+ v4sf_t t6 = { 0, 0 };
+ v4sf_t t7 = { 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowB = { BO[l], BO[l] };
+ v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
+ v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
+ v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
+ v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
+ v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
+ v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
+ v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
+ v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
+ t += rowA * rowB;
+ t1 += rowA1 * rowB;
+ t2 += rowA2 * rowB;
+ t3 += rowA3 * rowB;
+ t4 += rowA4 * rowB;
+ t5 += rowA5 * rowB;
+ t6 += rowA6 * rowB;
+ t7 += rowA7 * rowB;
+ }
+ t = t * valpha;
+ t1 = t1 * valpha;
+ t2 = t2 * valpha;
+ t3 = t3 * valpha;
+ t4 = t4 * valpha;
+ t5 = t5 * valpha;
+ t6 = t6 * valpha;
+ t7 = t7 * valpha;
+#if defined(TRMMKERNEL)
+ CO[0] = t[0];
+ CO[1] = t[1];
+ CO[2] = t1[0];
+ CO[3] = t1[1];
+ CO[4] = t2[0];
+ CO[5] = t2[1];
+ CO[6] = t3[0];
+ CO[7] = t3[1];
+ CO[8] = t4[0];
+ CO[9] = t4[1];
+ CO[10] = t5[0];
+ CO[11] = t5[1];
+ CO[12] = t6[0];
+ CO[13] = t6[1];
+ CO[14] = t7[0];
+ CO[15] = t7[1];
+#else
+ CO[0] += t[0];
+ CO[1] += t[1];
+ CO[2] += t1[0];
+ CO[3] += t1[1];
+ CO[4] += t2[0];
+ CO[5] += t2[1];
+ CO[6] += t3[0];
+ CO[7] += t3[1];
+ CO[8] += t4[0];
+ CO[9] += t4[1];
+ CO[10] += t5[0];
+ CO[11] += t5[1];
+ CO[12] += t6[0];
+ CO[13] += t6[1];
+ CO[14] += t7[0];
+ CO[15] += t7[1];
+#endif
+ AO += temp << 4;
+ BO += temp;
+ CO += 16;
+ i -= 16;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (16, 1)
+#endif
+ }
+ while (i >= 8)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (8, 1)
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0 };
+ v4sf_t t1 = { 0, 0 };
+ v4sf_t t2 = { 0, 0 };
+ v4sf_t t3 = { 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowB = { BO[l], BO[l] };
+ v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] };
+ v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] };
+ v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] };
+ v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] };
+ t += rowA * rowB;
+ t1 += rowA1 * rowB;
+ t2 += rowA2 * rowB;
+ t3 += rowA3 * rowB;
+ }
+ t = t * valpha;
+ t1 = t1 * valpha;
+ t2 = t2 * valpha;
+ t3 = t3 * valpha;
+#if defined(TRMMKERNEL)
+ CO[0] = t[0];
+ CO[1] = t[1];
+ CO[2] = t1[0];
+ CO[3] = t1[1];
+ CO[4] = t2[0];
+ CO[5] = t2[1];
+ CO[6] = t3[0];
+ CO[7] = t3[1];
+#else
+ CO[0] += t[0];
+ CO[1] += t[1];
+ CO[2] += t1[0];
+ CO[3] += t1[1];
+ CO[4] += t2[0];
+ CO[5] += t2[1];
+ CO[6] += t3[0];
+ CO[7] += t3[1];
+#endif
+ AO += temp << 3;
+ BO += temp;
+ CO += 8;
+ i -= 8;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (8, 1)
+#endif
+ }
+ while (i >= 4)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (4, 1)
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0 };
+ v4sf_t t1 = { 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowB = { BO[l], BO[l] };
+ v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] };
+ v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] };
+ t += rowA * rowB;
+ t1 += rowA1 * rowB;
+ }
+ t = t * valpha;
+ t1 = t1 * valpha;
+#if defined(TRMMKERNEL)
+ CO[0] = t[0];
+ CO[1] = t[1];
+ CO[2] = t1[0];
+ CO[3] = t1[1];
+#else
+ CO[0] += t[0];
+ CO[1] += t[1];
+ CO[2] += t1[0];
+ CO[3] += t1[1];
+#endif
+ AO += temp << 2;
+ BO += temp;
+ CO += 4;
+ i -= 4;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (4, 1)
+#endif
+ }
+ while (i >= 2)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (2, 1)
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowB = { BO[l], BO[l] };
+ v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] };
+ t += rowA * rowB;
+ }
+ t = t * valpha;
+#if defined(TRMMKERNEL)
+ CO[0] = t[0];
+ CO[1] = t[1];
+#else
+ CO[0] += t[0];
+ CO[1] += t[1];
+#endif
+ AO += temp << 1;
+ BO += temp;
+ CO += 2;
+ i -= 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (2, 1)
+#endif
+ }
+ while (i >= 1)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (1, 1)
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ FLOAT t = 0;
+ for (l = 0; l < temp; l++)
+ {
+ t += AO[l] * BO[l];
+ }
+ AO += temp;
+ BO += temp;
+#if defined(TRMMKERNEL)
+ CO[0] = t * alpha;
+#else
+ CO[0] += t * alpha;
+#endif
+ CO += 1;
+ i -= 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (1, 1)
+#endif
+ }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1; // number of values in B
+#endif
+ B += k;
+ }
+ return 0;
+}
--- /dev/null
+/*********************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
+#if defined(TRMMKERNEL)
+#define SAVE_ACC(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[0* ldc+J]; \
+ rowC[0] = result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[1*ldc+J]; \
+ rowC[0] = result[2] * alpha; \
+ rowC = (v4sf_t *) &CO[2*ldc+J]; \
+ rowC[0] = result[1] * alpha; \
+ rowC = (v4sf_t *) &CO[3*ldc+J]; \
+ rowC[0] = result[0] * alpha;
+#define SAVE_ACC1(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[4* ldc+J]; \
+ rowC[0] = result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[5*ldc+J]; \
+ rowC[0] = result[2] * alpha; \
+ rowC = (v4sf_t *) &CO[6*ldc+J]; \
+ rowC[0] = result[1] * alpha; \
+ rowC = (v4sf_t *) &CO[7*ldc+J]; \
+ rowC[0] = result[0] * alpha;
+#define SAVE4x2_ACC(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v2sf_t *) &CO[0* ldc+J]; \
+ rowC[0] = result[6] * alpha; \
+ rowC = (v2sf_t *) &CO[1* ldc+J]; \
+ rowC[0] = result[4] * alpha; \
+ rowC = (v2sf_t *) &CO[2* ldc+J]; \
+ rowC[0] = result[2] * alpha; \
+ rowC = (v2sf_t *) &CO[3* ldc+J]; \
+ rowC[0] = result[0] * alpha;
+#define SAVE4x2_ACC1(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v2sf_t *) &CO[4* ldc+J]; \
+ rowC[0] = result[6] * alpha; \
+ rowC = (v2sf_t *) &CO[5* ldc+J]; \
+ rowC[0] = result[4] * alpha; \
+ rowC = (v2sf_t *) &CO[6* ldc+J]; \
+ rowC[0] = result[2] * alpha; \
+ rowC = (v2sf_t *) &CO[7* ldc+J]; \
+ rowC[0] = result[0] * alpha;
+#define SAVE2x4_ACC(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[0* ldc+J]; \
+ rowC[0] = result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[1* ldc+J]; \
+ rowC[0] = result[2] * alpha;
+#else
+#define SAVE_ACC(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[0* ldc+J]; \
+ rowC[0] += result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[1*ldc+J]; \
+ rowC[0] += result[2] * alpha; \
+ rowC = (v4sf_t *) &CO[2*ldc+J]; \
+ rowC[0] += result[1] * alpha; \
+ rowC = (v4sf_t *) &CO[3*ldc+J]; \
+ rowC[0] += result[0] * alpha;
+#define SAVE_ACC1(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[4* ldc+J]; \
+ rowC[0] += result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[5*ldc+J]; \
+ rowC[0] += result[2] * alpha; \
+ rowC = (v4sf_t *) &CO[6*ldc+J]; \
+ rowC[0] += result[1] * alpha; \
+ rowC = (v4sf_t *) &CO[7*ldc+J]; \
+ rowC[0] += result[0] * alpha;
+#define SAVE4x2_ACC(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v2sf_t *) &CO[0* ldc+J]; \
+ rowC[0] += result[6] * alpha; \
+ rowC = (v2sf_t *) &CO[1* ldc+J]; \
+ rowC[0] += result[4] * alpha; \
+ rowC = (v2sf_t *) &CO[2* ldc+J]; \
+ rowC[0] += result[2] * alpha; \
+ rowC = (v2sf_t *) &CO[3* ldc+J]; \
+ rowC[0] += result[0] * alpha;
+#define SAVE4x2_ACC1(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v2sf_t *) &CO[4* ldc+J]; \
+ rowC[0] += result[6] * alpha; \
+ rowC = (v2sf_t *) &CO[5* ldc+J]; \
+ rowC[0] += result[4] * alpha; \
+ rowC = (v2sf_t *) &CO[6* ldc+J]; \
+ rowC[0] += result[2] * alpha; \
+ rowC = (v2sf_t *) &CO[7* ldc+J]; \
+ rowC[0] += result[0] * alpha;
+#define SAVE2x4_ACC(ACC, J) \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) &CO[0* ldc+J]; \
+ rowC[0] += result[3] * alpha; \
+ rowC = (v4sf_t *) &CO[1* ldc+J]; \
+ rowC[0] += result[2] * alpha;
+#endif
+#define KERNEL(i, j) \
+ __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \
+ __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \
+ __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \
+ __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \
+ __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \
+ __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \
+ __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \
+ __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]);
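+/* KERNEL(i, j): one k step of the 16x8 tile; rowB[i]/rowB[i+1] hold the 8 B values and
+   rowA[j..j+3] the 16 A values, updating eight 4x4 fp32 accumulators. */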
+#define SET_ACC_ZERO4() \
+ __builtin_mma_xxsetaccz (&acc0); \
+ __builtin_mma_xxsetaccz (&acc1); \
+ __builtin_mma_xxsetaccz (&acc2); \
+ __builtin_mma_xxsetaccz (&acc3);
+
+#define SET_ACC_ZERO8() \
+ __builtin_mma_xxsetaccz (&acc0); \
+ __builtin_mma_xxsetaccz (&acc1); \
+ __builtin_mma_xxsetaccz (&acc2); \
+ __builtin_mma_xxsetaccz (&acc3); \
+ __builtin_mma_xxsetaccz (&acc4); \
+ __builtin_mma_xxsetaccz (&acc5); \
+ __builtin_mma_xxsetaccz (&acc6); \
+ __builtin_mma_xxsetaccz (&acc7);
+
+#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+#define REFRESH_TEMP_BK(x, y) \
+ temp = k - off;
+#elif defined(LEFT)
+#define REFRESH_TEMP_BK(x, y) \
+ temp = off + x;
+#else
+#define REFRESH_TEMP_BK(x, y) \
+ temp = off + y;
+#endif
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_POINTERS(x, y) \
+ BO = B; \
+ REFRESH_TEMP_BK(x, y)
+#else
+#define REFRESH_POINTERS(x, y) \
+ AO += off * x; \
+ BO = B + off * y; \
+ REFRESH_TEMP_BK(x, y)
+#endif
+
+#ifdef LEFT
+#define REFRESH_OFF(x) \
+ off += x;
+#else
+#define REFRESH_OFF(x)
+#endif
+
+#ifdef LEFT
+#define UPDATE_TEMP(x, y) \
+ temp -= x;
+#else
+#define UPDATE_TEMP(x, y) \
+ temp -= y;
+#endif
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_TMP_AFTER_SAVE(x, y) \
+ temp = k - off; \
+ UPDATE_TEMP(x, y) \
+ AO += temp * x; \
+ BO += temp * y;
+#else
+#define REFRESH_TMP_AFTER_SAVE(x, y)
+#endif
+
+#define REFRESH_AFTER_SAVE(x,y) \
+ REFRESH_TMP_AFTER_SAVE(x, y) \
+ REFRESH_OFF(x)
+/*************************************************************************************
+* GEMM Kernel
+*************************************************************************************/
+int
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
+ FLOAT * C, BLASLONG ldc
+#ifdef TRMMKERNEL
+ , BLASLONG offset
+#endif
+ )
+{
+ BLASLONG N = n;
+ BLASLONG i1;
+#if defined(TRMMKERNEL)
+ BLASLONG off;
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off = -offset;
+#endif
+
+ v4sf_t valpha = { alpha, alpha, alpha, alpha };
+ N = n >> 3;
+ for (i1 = 0; i1 < N; i1++)
+ {
+ BLASLONG i, j, temp;
+ FLOAT *CO;
+ FLOAT *AO;
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+ CO = C;
+ C += ldc << 3;
+ AO = A;
+ PREFETCH1 (A, 128);
+ PREFETCH1 (A, 256);
+ i = m >> 4;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (16, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+ SET_ACC_ZERO8 ();
+ BLASLONG l = 0;
+ BLASLONG K = temp / 64;
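+ /* Unroll the k loop 64 deep; the power-of-two tails (32/16/8/4/2/1) are peeled off
+    by the blocks that follow. */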
+ for (l = 0; l < K; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ KERNEL (0, 0);
+ KERNEL (2, 4);
+ KERNEL (4, 8);
+ KERNEL (6, 12);
+ KERNEL (8, 16);
+ KERNEL (10, 20);
+ KERNEL (12, 24);
+ KERNEL (14, 28);
+ KERNEL (16, 32);
+ KERNEL (18, 36);
+ KERNEL (20, 40);
+ KERNEL (22, 44);
+ KERNEL (24, 48);
+ KERNEL (26, 52);
+ KERNEL (28, 56);
+ KERNEL (30, 60);
+ KERNEL (32, 64);
+ KERNEL (34, 68);
+ KERNEL (36, 72);
+ KERNEL (38, 76);
+ KERNEL (40, 80);
+ KERNEL (42, 84);
+ KERNEL (44, 88);
+ KERNEL (46, 92);
+ KERNEL (48, 96);
+ KERNEL (50, 100);
+ KERNEL (52, 104);
+ KERNEL (54, 108);
+ KERNEL (56, 112);
+ KERNEL (58, 116);
+ KERNEL (60, 120);
+ KERNEL (62, 124);
+ KERNEL (64, 128);
+ KERNEL (66, 132);
+ KERNEL (68, 136);
+ KERNEL (70, 140);
+ KERNEL (72, 144);
+ KERNEL (74, 148);
+ KERNEL (76, 152);
+ KERNEL (78, 156);
+ KERNEL (80, 160);
+ KERNEL (82, 164);
+ KERNEL (84, 168);
+ KERNEL (86, 172);
+ KERNEL (88, 176);
+ KERNEL (90, 180);
+ KERNEL (92, 184);
+ KERNEL (94, 188);
+ KERNEL (96, 192);
+ KERNEL (98, 196);
+ KERNEL (100, 200);
+ KERNEL (102, 204);
+ KERNEL (104, 208);
+ KERNEL (106, 212);
+ KERNEL (108, 216);
+ KERNEL (110, 220);
+ KERNEL (112, 224);
+ KERNEL (114, 228);
+ KERNEL (116, 232);
+ KERNEL (118, 236);
+ KERNEL (120, 240);
+ KERNEL (122, 244);
+ KERNEL (124, 248);
+ KERNEL (126, 252);
+ AO += 1024;
+ BO += 512;
+ }
+ if ((temp & 63) >> 5)
+ {
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ KERNEL (0, 0);
+ KERNEL (2, 4);
+ KERNEL (4, 8);
+ KERNEL (6, 12);
+ KERNEL (8, 16);
+ KERNEL (10, 20);
+ KERNEL (12, 24);
+ KERNEL (14, 28);
+ KERNEL (16, 32);
+ KERNEL (18, 36);
+ KERNEL (20, 40);
+ KERNEL (22, 44);
+ KERNEL (24, 48);
+ KERNEL (26, 52);
+ KERNEL (28, 56);
+ KERNEL (30, 60);
+ KERNEL (32, 64);
+ KERNEL (34, 68);
+ KERNEL (36, 72);
+ KERNEL (38, 76);
+ KERNEL (40, 80);
+ KERNEL (42, 84);
+ KERNEL (44, 88);
+ KERNEL (46, 92);
+ KERNEL (48, 96);
+ KERNEL (50, 100);
+ KERNEL (52, 104);
+ KERNEL (54, 108);
+ KERNEL (56, 112);
+ KERNEL (58, 116);
+ KERNEL (60, 120);
+ KERNEL (62, 124);
+ AO += 512;
+ BO += 256;
+ }
+ if ((temp & 31) >> 4)
+ {
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ KERNEL (0, 0);
+ KERNEL (2, 4);
+ KERNEL (4, 8);
+ KERNEL (6, 12);
+ KERNEL (8, 16);
+ KERNEL (10, 20);
+ KERNEL (12, 24);
+ KERNEL (14, 28);
+ KERNEL (16, 32);
+ KERNEL (18, 36);
+ KERNEL (20, 40);
+ KERNEL (22, 44);
+ KERNEL (24, 48);
+ KERNEL (26, 52);
+ KERNEL (28, 56);
+ KERNEL (30, 60);
+ AO += 256;
+ BO += 128;
+ }
+ if ((temp & 15) >> 3)
+ {
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ KERNEL (0, 0);
+ KERNEL (2, 4);
+ KERNEL (4, 8);
+ KERNEL (6, 12);
+ KERNEL (8, 16);
+ KERNEL (10, 20);
+ KERNEL (12, 24);
+ KERNEL (14, 28);
+ AO += 128;
+ BO += 64;
+ }
+ if ((temp & 7) >> 2)
+ {
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ KERNEL (0, 0);
+ KERNEL (2, 4);
+ KERNEL (4, 8);
+ KERNEL (6, 12);
+ AO += 64;
+ BO += 32;
+ }
+ if ((temp & 3) >> 1)
+ {
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ KERNEL (0, 0);
+ KERNEL (2, 4);
+ AO += 32;
+ BO += 16;
+ }
+ if ((temp & 1) >> 0)
+ {
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ KERNEL (0, 0);
+ AO += 16;
+ BO += 8;
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC (&acc2, 4);
+ SAVE_ACC1 (&acc1, 0);
+ SAVE_ACC1 (&acc3, 4);
+ SAVE_ACC (&acc4, 8);
+ SAVE_ACC (&acc6, 12);
+ SAVE_ACC1 (&acc5, 8);
+ SAVE_ACC1 (&acc7, 12);
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (16, 8)
+#endif
+ CO += 16;
+ }
+ i = (m & 15) >> 3;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (8, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1, acc2, acc3;
+ SET_ACC_ZERO4 ();
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[l << 3];
+ vec_t *rowB = (vec_t *) & BO[l << 3];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
+ __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]);
+ __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]);
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC (&acc2, 4);
+ SAVE_ACC1 (&acc1, 0);
+ SAVE_ACC1 (&acc3, 4);
+ AO += (temp << 3);
+ BO += (temp << 3);
+ CO += 8;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (8, 8)
+#endif
+ }
+ i = (m & 7) >> 2;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (4, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1;
+ __builtin_mma_xxsetaccz (&acc0);
+ __builtin_mma_xxsetaccz (&acc1);
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[l << 2];
+ vec_t *rowB = (vec_t *) & BO[l << 3];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC1 (&acc1, 0);
+ CO += 4;
+ AO += (temp << 2);
+ BO += (temp << 3);
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (4, 8)
+#endif
+ }
+ i = (m & 3) >> 1;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (2, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+
+ v2sf_t *rowC;
+ v2sf_t result[8];
+ __vector_quad acc0, acc1;
+ __builtin_mma_xxsetaccz (&acc0);
+ __builtin_mma_xxsetaccz (&acc1);
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ FLOAT t[4] = { 0 };
+ t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
+ vec_t *rowA = (vec_t *) & t[0];
+ vec_t *rowB = (vec_t *) & BO[l << 3];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
+ }
+ SAVE4x2_ACC (&acc0, 0);
+ SAVE4x2_ACC1 (&acc1, 0);
+ CO += 2;
+ AO += (temp << 1);
+ BO += (temp << 3);
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (2, 8)
+#endif
+ }
+ i = (m & 1) >> 0;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (1, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0, 0, 0 };
+ v4sf_t t1 = { 0, 0, 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
+ v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2],
+ BO[(l << 3) + 3]
+ };
+ v4sf_t rowB1 =
+ { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6],
+ BO[(l << 3) + 7]
+ };
+ t += rowA * rowB;
+ t1 += rowA * rowB1;
+ }
+ t = t * valpha;
+ t1 = t1 * valpha;
+#if defined(TRMMKERNEL)
+ CO[0 * ldc] = t[0];
+ CO[1 * ldc] = t[1];
+ CO[2 * ldc] = t[2];
+ CO[3 * ldc] = t[3];
+ CO[4 * ldc] = t1[0];
+ CO[5 * ldc] = t1[1];
+ CO[6 * ldc] = t1[2];
+ CO[7 * ldc] = t1[3];
+#else
+ CO[0 * ldc] += t[0];
+ CO[1 * ldc] += t[1];
+ CO[2 * ldc] += t[2];
+ CO[3 * ldc] += t[3];
+ CO[4 * ldc] += t1[0];
+ CO[5 * ldc] += t1[1];
+ CO[6 * ldc] += t1[2];
+ CO[7 * ldc] += t1[3];
+#endif
+ CO += 1;
+ AO += temp;
+ BO += (temp << 3);
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (1, 8)
+#endif
+ }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 8; // number of values in B
+#endif
+
+ B += k << 3;
+ }
+ N = (n & 7) >> 2;
+ for (i1 = 0; i1 < N; i1++)
+ {
+ BLASLONG i, j, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+ FLOAT *CO;
+ FLOAT *AO;
+ CO = C;
+ C += ldc << 2;
+ AO = A;
+#if !defined(TRMMKERNEL)
+ i = m >> 5;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO = B;
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ FLOAT *A1;
+ A1 = AO + (16 * k);
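+ /* m=32 path: A is packed as two consecutive 16-row panels of length k; A1 points at
+    the second panel. */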
+ __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+ SET_ACC_ZERO8 ();
+ BLASLONG l = 0;
+ for (l = 0; l < k; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[l << 4];
+ vec_t *rowA1 = (vec_t *) & A1[l << 4];
+ vec_t *rowB = (vec_t *) & BO[l << 2];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+ __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
+ __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
+ __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
+ __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
+ __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
+ __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
+ }
+
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC (&acc1, 4);
+ CO += 8;
+ SAVE_ACC (&acc2, 0);
+ SAVE_ACC (&acc3, 4);
+ CO += 8;
+ SAVE_ACC (&acc4, 0);
+ SAVE_ACC (&acc5, 4);
+ CO += 8;
+ SAVE_ACC (&acc6, 0);
+ SAVE_ACC (&acc7, 4);
+ CO += 8;
+ AO += k << 5;
+ BO += k << 2;
+ }
+ i = (m & 31) >> 4;
+#else
+ i = m >> 4;
+#endif
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (16, 4);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1, acc2, acc3;
+ SET_ACC_ZERO4 ();
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[l << 4];
+ vec_t *rowB = (vec_t *) & BO[l << 2];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+ __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
+ __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
+ }
+
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC (&acc1, 4);
+ CO += 8;
+ SAVE_ACC (&acc2, 0);
+ SAVE_ACC (&acc3, 4);
+ CO += 8;
+ AO += temp << 4;
+ BO += temp << 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (16, 4)
+#endif
+ }
+ i = (m & 15) >> 3;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (8, 4);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1;
+ __builtin_mma_xxsetaccz (&acc0);
+ __builtin_mma_xxsetaccz (&acc1);
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[l << 3];
+ vec_t *rowB = (vec_t *) & BO[l << 2];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC (&acc1, 4);
+ CO += 8;
+ AO += temp << 3;
+ BO += temp << 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (8, 4)
+#endif
+ }
+ i = (m & 7) >> 2;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (4, 4);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ __vector_quad acc0;
+ v4sf_t result[4];
+ __builtin_mma_xxsetaccz (&acc0);
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ vec_t *rowA = (vec_t *) & AO[l << 2];
+ vec_t *rowB = (vec_t *) & BO[l << 2];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ }
+ SAVE_ACC (&acc0, 0);
+ CO += 4;
+ AO += temp << 2;
+ BO += temp << 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (4, 4)
+#endif
+ }
+ i = (m & 3) >> 1;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (2, 4);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v2sf_t *rowC;
+ v2sf_t result[8];
+ __vector_quad acc0;
+ __builtin_mma_xxsetaccz (&acc0);
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ FLOAT t[4] = { 0 };
+ t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
+ vec_t *rowA = (vec_t *) & t[0];
+ vec_t *rowB = (vec_t *) & BO[l << 2];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ }
+ SAVE4x2_ACC (&acc0, 0);
+ CO += 2;
+ AO += temp << 1;
+ BO += temp << 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (2, 4)
+#endif
+ }
+ i = (m & 1) >> 0;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (1, 4)
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0, 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
+ v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2],
+ BO[(l << 2) + 3]
+ };
+ t += rowA * rowB;
+ }
+ t = t * valpha;
+#if defined(TRMMKERNEL)
+ CO[0 * ldc] = t[0];
+ CO[1 * ldc] = t[1];
+ CO[2 * ldc] = t[2];
+ CO[3 * ldc] = t[3];
+#else
+ CO[0 * ldc] += t[0];
+ CO[1 * ldc] += t[1];
+ CO[2 * ldc] += t[2];
+ CO[3 * ldc] += t[3];
+#endif
+ CO += 1;
+ AO += temp;
+ BO += temp << 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (1, 4)
+#endif
+ }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4; // number of values in B
+#endif
+
+ B += k << 2;
+ }
+ N = (n & 3) >> 1;
+ for (i1 = 0; i1 < N; i1++)
+ {
+ BLASLONG i, j, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+ FLOAT *CO;
+ FLOAT *AO;
+ CO = C;
+ C += ldc << 1;
+ AO = A;
+#if !defined(TRMMKERNEL)
+ i = m >> 5;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO = B;
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ FLOAT *A1;
+ A1 = AO + (16 * k);
+ __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+ SET_ACC_ZERO8 ();
+ BLASLONG l = 0;
+ for (l = 0; l < k; l++)
+ {
+ FLOAT t[4] = { 0 };
+ t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+ vec_t *rowB = (vec_t *) & t[0];
+ vec_t *rowA = (vec_t *) & AO[l << 4];
+ vec_t *rowA1 = (vec_t *) & A1[l << 4];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+ __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
+ __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
+ __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
+ __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
+ __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
+ __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
+ }
+ SAVE2x4_ACC (&acc0, 0);
+ SAVE2x4_ACC (&acc1, 4);
+ SAVE2x4_ACC (&acc2, 8);
+ SAVE2x4_ACC (&acc3, 12);
+ CO += 16;
+ SAVE2x4_ACC (&acc4, 0);
+ SAVE2x4_ACC (&acc5, 4);
+ SAVE2x4_ACC (&acc6, 8);
+ SAVE2x4_ACC (&acc7, 12);
+ CO += 16;
+ AO += k << 5;
+ BO += k << 1;
+ }
+ i = (m & 31) >> 4;
+#else
+ i = m >> 4;
+#endif
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1, acc2, acc3;
+ SET_ACC_ZERO4 ();
+ BLASLONG l = 0;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (16, 2)
+#else
+ BO = B;
+ temp = k;
+#endif
+ for (l = 0; l < temp; l++)
+ {
+ FLOAT t[4] = { 0 };
+ t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+ vec_t *rowB = (vec_t *) & t[0];
+ vec_t *rowA = (vec_t *) & AO[l << 4];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+ __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
+ __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
+ }
+ SAVE2x4_ACC (&acc0, 0);
+ SAVE2x4_ACC (&acc1, 4);
+ SAVE2x4_ACC (&acc2, 8);
+ SAVE2x4_ACC (&acc3, 12);
+ CO += 16;
+ AO += temp << 4;
+ BO += temp << 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (16, 2)
+#endif
+ }
+ i = (m & 15) >> 3;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1;
+ __builtin_mma_xxsetaccz (&acc0);
+ __builtin_mma_xxsetaccz (&acc1);
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (8, 2)
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ FLOAT t[4] = { 0 };
+ t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+ vec_t *rowB = (vec_t *) & t[0];
+ vec_t *rowA = (vec_t *) & AO[l << 3];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+ }
+ SAVE2x4_ACC (&acc0, 0);
+ SAVE2x4_ACC (&acc1, 4);
+ CO += 8;
+ AO += temp << 3;
+ BO += temp << 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (8, 2)
+#endif
+ }
+ i = (m & 7) >> 2;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0;
+ __builtin_mma_xxsetaccz (&acc0);
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (4, 2)
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ for (l = 0; l < temp; l++)
+ {
+ FLOAT t[4] = { 0 };
+ t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+ vec_t *rowB = (vec_t *) & t[0];
+ vec_t *rowA = (vec_t *) & AO[l << 2];
+ __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+ }
+ SAVE2x4_ACC (&acc0, 0);
+ CO += 4;
+ AO += temp << 2;
+ BO += temp << 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (4, 2)
+#endif
+ }
+ i = (m & 3) >> 1;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+ BLASLONG l = 0;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (2, 2)
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t t = { 0, 0, 0, 0 };
+ for (l = 0; l < (temp << 1); l += 2)
+ {
+ v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] };
+ v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] };
+ t += rowA * rowB;
+ }
+ t = t * valpha;
+#if defined(TRMMKERNEL)
+ CO[0 * ldc] = t[0];
+ CO[1 * ldc] = t[1];
+ CO[0 * ldc + 1] = t[2];
+ CO[1 * ldc + 1] = t[3];
+#else
+ CO[0 * ldc] += t[0];
+ CO[1 * ldc] += t[1];
+ CO[0 * ldc + 1] += t[2];
+ CO[1 * ldc + 1] += t[3];
+#endif
+ CO += 2;
+ AO += temp << 1;
+ BO += temp << 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (2, 2)
+#endif
+ }
+ i = (m & 1) >> 0;
+ for (j = 0; j < i; j++)
+ {
+ FLOAT *BO;
+ BLASLONG l = 0;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (1, 2)
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t t = { 0, 0, 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowA = { AO[l], AO[l], 0, 0 };
+ v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 };
+ t += rowA * rowB;
+ }
+ t = t * valpha;
+#if defined(TRMMKERNEL)
+ CO[0 * ldc] = t[0];
+ CO[1 * ldc] = t[1];
+#else
+ CO[0 * ldc] += t[0];
+ CO[1 * ldc] += t[1];
+#endif
+ CO += 1;
+ AO += temp;
+ BO += temp << 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (1, 2)
+#endif
+ }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2; // number of values in B
+#endif
+
+ B += k << 1;
+ }
+ N = (n & 1) >> 0;
+ for (i1 = 0; i1 < N; i1++)
+ {
+ BLASLONG i, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+ FLOAT *CO;
+ FLOAT *AO;
+ CO = C;
+ C += ldc;
+ AO = A;
+ i = m;
+ while (i >= 16)
+ {
+ FLOAT *BO;
+ BLASLONG l = 0;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (16, 1)
+#else
+ BO = B;
+ temp = k;
+#endif
+
+ v4sf_t t = { 0, 0, 0, 0 };
+ v4sf_t t1 = { 0, 0, 0, 0 };
+ v4sf_t t2 = { 0, 0, 0, 0 };
+ v4sf_t t3 = { 0, 0, 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
+ v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2],
+ AO[(l << 4) + 3]
+ };
+ v4sf_t rowA1 =
+ { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6],
+ AO[(l << 4) + 7]
+ };
+ v4sf_t rowA2 =
+ { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10],
+ AO[(l << 4) + 11]
+ };
+ v4sf_t rowA3 =
+ { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14],
+ AO[(l << 4) + 15]
+ };
+ t += rowA * rowB;
+ t1 += rowA1 * rowB;
+ t2 += rowA2 * rowB;
+ t3 += rowA3 * rowB;
+ }
+ t = t * valpha;
+ t1 = t1 * valpha;
+ t2 = t2 * valpha;
+ t3 = t3 * valpha;
+#if defined(TRMMKERNEL)
+ CO[0] = t[0];
+ CO[1] = t[1];
+ CO[2] = t[2];
+ CO[3] = t[3];
+ CO[4] = t1[0];
+ CO[5] = t1[1];
+ CO[6] = t1[2];
+ CO[7] = t1[3];
+ CO[8] = t2[0];
+ CO[9] = t2[1];
+ CO[10] = t2[2];
+ CO[11] = t2[3];
+ CO[12] = t3[0];
+ CO[13] = t3[1];
+ CO[14] = t3[2];
+ CO[15] = t3[3];
+#else
+ CO[0] += t[0];
+ CO[1] += t[1];
+ CO[2] += t[2];
+ CO[3] += t[3];
+ CO[4] += t1[0];
+ CO[5] += t1[1];
+ CO[6] += t1[2];
+ CO[7] += t1[3];
+ CO[8] += t2[0];
+ CO[9] += t2[1];
+ CO[10] += t2[2];
+ CO[11] += t2[3];
+ CO[12] += t3[0];
+ CO[13] += t3[1];
+ CO[14] += t3[2];
+ CO[15] += t3[3];
+#endif
+ AO += temp << 4;
+ BO += temp;
+ CO += 16;
+ i -= 16;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (16, 1)
+#endif
+ }
+ while (i >= 8)
+ {
+ FLOAT *BO;
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0, 0, 0 };
+ v4sf_t t1 = { 0, 0, 0, 0 };
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (8, 1)
+#else
+ BO = B;
+ temp = k;
+#endif
+
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
+ v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2],
+ AO[(l << 3) + 3]
+ };
+ v4sf_t rowA1 =
+ { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6],
+ AO[(l << 3) + 7]
+ };
+ t += rowA * rowB;
+ t1 += rowA1 * rowB;
+ }
+ t = t * valpha;
+ t1 = t1 * valpha;
+#if defined(TRMMKERNEL)
+ CO[0] = t[0];
+ CO[1] = t[1];
+ CO[2] = t[2];
+ CO[3] = t[3];
+ CO[4] = t1[0];
+ CO[5] = t1[1];
+ CO[6] = t1[2];
+ CO[7] = t1[3];
+#else
+ CO[0] += t[0];
+ CO[1] += t[1];
+ CO[2] += t[2];
+ CO[3] += t[3];
+ CO[4] += t1[0];
+ CO[5] += t1[1];
+ CO[6] += t1[2];
+ CO[7] += t1[3];
+#endif
+ AO += temp << 3;
+ BO += temp;
+ CO += 8;
+ i -= 8;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (8, 1)
+#endif
+ }
+ while (i >= 4)
+ {
+ FLOAT *BO;
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0, 0, 0 };
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (4, 1)
+#else
+ BO = B;
+ temp = k;
+#endif
+
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
+ v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2],
+ AO[(l << 2) + 3]
+ };
+ t += rowA * rowB;
+ }
+ t = t * valpha;
+#if defined(TRMMKERNEL)
+ CO[0] = t[0];
+ CO[1] = t[1];
+ CO[2] = t[2];
+ CO[3] = t[3];
+#else
+ CO[0] += t[0];
+ CO[1] += t[1];
+ CO[2] += t[2];
+ CO[3] += t[3];
+#endif
+ AO += temp << 2;
+ BO += temp;
+ CO += 4;
+ i -= 4;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (4, 1)
+#endif
+ }
+ while (i >= 2)
+ {
+ FLOAT *BO;
+ BLASLONG l = 0;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (2, 1)
+#else
+ BO = B;
+ temp = k;
+#endif
+
+ v4sf_t t = { 0, 0, 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowB = { BO[l], BO[l], 0, 0 };
+ v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 };
+ t += rowA * rowB;
+ }
+ t = t * valpha;
+#if defined(TRMMKERNEL)
+ CO[0] = t[0];
+ CO[1] = t[1];
+#else
+ CO[0] += t[0];
+ CO[1] += t[1];
+#endif
+ AO += temp << 1;
+ BO += temp;
+ CO += 2;
+ i -= 2;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (2, 1)
+#endif
+ }
+ while (i >= 1)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (1, 1)
+#else
+ BO = B;
+ temp = k;
+#endif
+
+ BLASLONG l = 0;
+ FLOAT t = 0;
+ for (l = 0; l < temp; l++)
+ {
+ t += AO[l] * BO[l];
+ }
+ AO += temp;
+ BO += temp;
+#if defined(TRMMKERNEL)
+ CO[0] = t * alpha;
+#else
+ CO[0] += t * alpha;
+#endif
+ CO += 1;
+ i -= 1;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (1, 1)
+#endif
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1; // number of values in B
+#endif
+ B += k;
+ }
+ return 0;
+}