powerpc: Optimized SGEMM/DGEMM/CGEMM for POWER10

author Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>

Wed, 24 Jun 2020 19:48:15 +0000 (14:48 -0500)

committer Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>

Wed, 24 Jun 2020 19:48:15 +0000 (14:48 -0500)
author Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Wed, 24 Jun 2020 19:48:15 +0000 (14:48 -0500)
committer Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Wed, 24 Jun 2020 19:48:15 +0000 (14:48 -0500)
diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10

index ab8fbfc..00d31f8 100644 (file)
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -7,12 +7,12 @@ else
  #CGEMM_BETA = ../generic/zgemm_beta.c
  #ZGEMM_BETA = ../generic/zgemm_beta.c
  
-STRMMKERNEL    = sgemm_kernel_power9.S
-DTRMMKERNEL    = dgemm_kernel_power9.S
-CTRMMKERNEL    = cgemm_kernel_power9.S
+STRMMKERNEL    = sgemm_kernel_power10.c
+DTRMMKERNEL    = dgemm_kernel_power10.c
+CTRMMKERNEL    = cgemm_kernel_power10.S
  ZTRMMKERNEL    = zgemm_kernel_power9.S
  
-SGEMMKERNEL    =  sgemm_kernel_power9.S
+SGEMMKERNEL    =  sgemm_kernel_power10.c
  SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
  SGEMMITCOPY    = sgemm_tcopy_16_power8.S
  SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
@@ -22,7 +22,7 @@ SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
  SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
  SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
  
-DGEMMKERNEL    =  dgemm_kernel_power9.S
+DGEMMKERNEL    =  dgemm_kernel_power10.c
  DGEMMINCOPY    = ../generic/gemm_ncopy_16.c
  DGEMMITCOPY    =  dgemm_tcopy_16_power8.S
  DGEMMONCOPY    =  dgemm_ncopy_4_power8.S
@@ -32,7 +32,7 @@ DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
  DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
  DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
  
-CGEMMKERNEL    = cgemm_kernel_power9.S
+CGEMMKERNEL    = cgemm_kernel_power10.S
  CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
  CGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
  CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S

new file mode 100644 (file)

index 0000000..e04f948
--- /dev/null
+++ b/kernel/power/cgemm_kernel_power10.S
@@ -0,0 +1,286 @@
+/***************************************************************************
+Copyright (c) 2013-2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+ 
+#define LOAD   ld
+#define STACKSIZE  (512 )  /* bytes the prologue takes off SP for the register save area */
+#define FLINK_SAVE (STACKSIZE+16) /* 16($r12): slot where the prologue stores LR, i.e. entry SP + 16 */  
+#define        M       r3
+#define        N       r4
+#define        K       r5
+/* M, N and K are read directly from r3-r5 by the loop logic; A, B and C are read from r8-r10. */
+/* LDC (r6) and, for TRMM builds, OFFSET (r7) are loaded from the caller's frame by the setup code. */
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r6
+#define OFFSET r7
+
+/* VSX registers that the setup code fills once before the loop logic is included. */
+#define alpha_r vs51
+#define alpha_i vs55
+#define save_permute_1 vs59
+#define permute_mask vs63
+#define o0     0
+ 
+/* General-purpose aliases: scratch (T*), loop counters (I, J, L) and working pointers (AO, BO, CO). */
+#define T1     r11
+#define T2     r12
+#define T3     r14
+#define T4     r15
+#define T5     r16
+#define T6     r17
+#define L      r18
+#define T7     r19
+#define T8     r20
+#define TEMP_REG       r21
+#define        I       r22
+#define J      r23
+#define AO     r24
+#define        BO      r25
+#define        CO      r26
+#define T9     r27
+#define        T10     r28
+#define        PRE     r29
+
+#define T12    r30
+#define T13    r31
+
+#include "cgemm_macros_power10.S"
+/* 64-bit halves of the two 128-bit vectors assembled with mtvsrdd in the setup code. */
+.equ    perm_const1, 0x0405060700010203
+.equ    perm_const2, 0x0c0d0e0f08090a0b
+.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
+.equ save_permute_11, 0x0405060714151617
+
+
+
+#ifndef NEEDPARAM
+
+       PROLOGUE
+       PROFCODE
+/* Reserve STACKSIZE bytes below the entry SP for the save area and copy LR into r0; */
+/* r0 is written to FLINK_SAVE(SP) at the end of this save sequence. */
+       addi    SP, SP, -STACKSIZE
+       mflr r0
+
+/* f14-f31 are stored at 0..136(SP); .L999 reloads them from the same offsets. */
+       stfd    f14,    0(SP)
+       stfd    f15,    8(SP)
+       stfd    f16,   16(SP)
+       stfd    f17,   24(SP)
+
+       stfd    f18,   32(SP)
+       stfd    f19,   40(SP)
+       stfd    f20,   48(SP)
+       stfd    f21,   56(SP)
+
+       stfd    f22,   64(SP)
+       stfd    f23,   72(SP)
+       stfd    f24,   80(SP)
+       stfd    f25,   88(SP)
+
+       stfd    f26,   96(SP)
+       stfd    f27,  104(SP)
+       stfd    f28,  112(SP)
+       stfd    f29,  120(SP)
+
+       stfd    f30,  128(SP)
+       stfd    f31,  136(SP)
+
+/* r31 down to r14 are stored at 144..280(SP). */
+       std     r31,  144(SP)
+       std     r30,  152(SP)
+       std     r29,  160(SP)
+       std     r28,  168(SP)
+       std     r27,  176(SP)
+       std     r26,  184(SP)
+       std     r25,  192(SP)
+       std     r24,  200(SP)
+       std     r23,  208(SP)
+       std     r22,  216(SP)
+       std     r21,  224(SP)
+       std     r20,  232(SP)
+       std     r19,  240(SP)
+       std     r18,  248(SP)
+       std     r17,  256(SP)
+       std     r16,  264(SP)
+       std     r15,  272(SP)
+       std     r14,  280(SP)
+ 
+/* vs52-vs63 are stored at 288..464(SP), 16 bytes apart. */
+  stxv    vs52,  288(SP)
+  stxv    vs53,  304(SP)
+  stxv    vs54,  320(SP)
+  stxv    vs55,  336(SP)
+  stxv    vs56,  352(SP)
+  stxv    vs57,  368(SP)
+  stxv    vs58,  384(SP)
+  stxv    vs59,  400(SP)
+  stxv    vs60,  416(SP)
+  stxv    vs61,  432(SP)
+  stxv    vs62,  448(SP)
+  stxv    vs63,  464(SP)
+  std     r0,   FLINK_SAVE(SP) /* LR (copied to r0 by mflr above) */
+ 
+
+
+       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+/* LDC is read from above the save area (STACKSIZE undoes the prologue's SP adjustment). */
+/* FRAMESLOT is defined outside this file. */
+
+#ifdef TRMMKERNEL
+       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+   slwi    LDC, LDC, ZBASE_SHIFT
+
+/* ZBASE_SHIFT is defined outside this file; presumably log2 of the byte size of one */
+/* complex element, turning LDC into a byte stride - TODO confirm. */
+       /* alpha arrives as doubles in vs1 (alpha_r) and vs2 (alpha_i): convert each to single and splat word 0 */
+    xscvdpspn alpha_r,vs1 
+    xscvdpspn alpha_i,vs2 
+       xxspltw   alpha_r,alpha_r,0 
+       xxspltw   alpha_i,alpha_i,0 
+/*load reverse permute mask for big endian
+  uint128 = 0x0c0d0e0f08090a0b0405060700010203
+*/ 
+/* Build four 64-bit constants in T1..T4, 16 bits at a time. Step 1: bits 48-63 via lis. */
+       lis T2, perm_const2@highest
+       lis T1, perm_const1@highest
+       lis T3, save_permute_12@highest
+       lis T4, save_permute_11@highest
+
+/* Step 2: OR in bits 32-47. */
+       ori T2, T2, perm_const2@higher
+       ori T1, T1, perm_const1@higher
+       ori T3, T3, save_permute_12@higher
+       ori T4, T4, save_permute_11@higher
+
+/* Step 3: rotate left by 32 and keep bits 0-31 (IBM numbering), moving that half to the top. */
+       rldicr T2, T2, 32, 31
+       rldicr T1, T1, 32, 31
+       rldicr T3, T3, 32, 31
+       rldicr T4, T4, 32, 31 
+/* Step 4: OR in bits 16-31. */
+       oris T2, T2, perm_const2@h
+       oris T1, T1, perm_const1@h
+       oris T3, T3, save_permute_12@h
+       oris T4, T4, save_permute_11@h
+
+/* Step 5: OR in bits 0-15. */
+       ori T2, T2, perm_const2@l  
+       ori T1, T1, perm_const1@l
+       ori T3, T3, save_permute_12@l  
+       ori T4, T4, save_permute_11@l
+
+/* r0 = 0 is used as a zero index by dcbt in the loop logic; PRE = 512 is the dcbt offset used there. */
+  li r0,0
+  li PRE,512
+
+#if defined(CC) || defined(CR) || defined(RC) || defined(RR) 
+/*negate for this case as we will use addition -1*(a+b) */
+  xvnegsp alpha_r,alpha_r
+  xvnegsp alpha_i,alpha_i
+#endif
+/* mtvsrdd XT,RA,RB: doubleword 0 of XT = RA, doubleword 1 = RB. */
+       mtvsrdd permute_mask,T2,T1
+       mtvsrdd save_permute_1,T3,T4    
+
+     /*mask is reverse permute so we have to make it inner permute (xxpermdi with 2 swaps the two doublewords) */
+       xxpermdi        permute_mask,   permute_mask,   permute_mask,2 
+
+#include "cgemm_logic_power10.S"
+
+.L999: 
+       lfd     f14,    0(SP)
+       lfd     f15,    8(SP)
+       lfd     f16,   16(SP)
+       lfd     f17,   24(SP)
+
+       lfd     f18,   32(SP)
+       lfd     f19,   40(SP)
+       lfd     f20,   48(SP)
+       lfd     f21,   56(SP)
+
+       lfd     f22,   64(SP)
+       lfd     f23,   72(SP)
+       lfd     f24,   80(SP)
+       lfd     f25,   88(SP)
+
+       lfd     f26,   96(SP)
+       lfd     f27,  104(SP)
+       lfd     f28,  112(SP)
+       lfd     f29,  120(SP)
+
+       lfd     f30,  128(SP)
+       lfd     f31,  136(SP)
+/* Exit path: everything the prologue stored is reloaded from the same offsets. */
+       ld      r31,  144(SP)
+       ld      r30,  152(SP)
+       ld      r29,  160(SP)
+       ld      r28,  168(SP)
+       ld      r27,  176(SP)
+       ld      r26,  184(SP)
+       ld      r25,  192(SP)
+       ld      r24,  200(SP)
+       ld      r23,  208(SP)
+       ld      r22,  216(SP)
+       ld      r21,  224(SP)
+       ld      r20,  232(SP)
+       ld      r19,  240(SP)
+       ld      r18,  248(SP)
+       ld      r17,  256(SP)
+       ld      r16,  264(SP)
+       ld      r15,  272(SP)
+       ld      r14,  280(SP)
+
+       ld    r0,        FLINK_SAVE(SP) 
+/* r0 now holds the saved LR; it is moved back with mtlr in the middle of the vector reloads below. */
+    lxv    vs52,  288(SP)
+    lxv    vs53,  304(SP)
+    lxv    vs54,  320(SP)
+    lxv    vs55,  336(SP)
+    lxv    vs56,  352(SP)
+    lxv    vs57,  368(SP)
+    lxv    vs58,  384(SP) 
+    lxv    vs59,  400(SP)
+       mtlr r0
+    lxv    vs60,  416(SP)
+    lxv    vs61,  432(SP) 
+    lxv    vs62,  448(SP)
+    lxv    vs63,  464(SP)
+/* Release the save area and return through the restored LR. */
+       addi    SP, SP, STACKSIZE 
+       blr
+
+
+       EPILOGUE
+#endif
diff --git a/kernel/power/cgemm_logic_power10.S b/kernel/power/cgemm_logic_power10.S

new file mode 100644 (file)

index 0000000..3700ac8
--- /dev/null
+++ b/kernel/power/cgemm_logic_power10.S
@@ -0,0 +1,2814 @@
+/***************************************************************************
+Copyright (c) 2013-2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define MY_ALIGN .align 3 /* every MY_ALIGN below expands to the assembler directive .align 3 */
+b CGEMM_L4 /* jump over the bl-called mini subroutines to the start of the N/4 panel loop */
+/*                MINI SUBROUTINES (entered with bl, left with blr)      */      
+/*                4x8 MAIN 128x+2 LOOP                     */      
+
+
+CGEMM_L4x8_LMAIN_SUB:
+/* Entered with bl from CGEMM_L4x8_BEGIN. T8 (pass count) goes into CTR; T2..T5 hold dcbt offsets set by the caller. */
+    mtctr   T8
+    LOAD4x8_2 
+    MY_ALIGN
+CGEMM_L4x8_LOOP:
+/* One pass = 64 KERNEL4x8_L2 steps (index 0..63); only step 63 passes 1 as its last argument. */
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL4x8_L2 128,64,0,0 
+CGEMM_L4x8_K128:
+/* Second entry point: CGEMM_L4x8_SUB0 issues its own loads, sets CTR = 1 and calls here, so step 0 is skipped. */
+    KERNEL4x8_L2 128,64,1,0
+    dcbt    AO, T2  
+    KERNEL4x8_L2 128,64,2,0
+    KERNEL4x8_L2 128,64,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL4x8_L2 128,64,4,0
+    KERNEL4x8_L2 128,64,5,0
+    dcbt    AO, T4  
+    KERNEL4x8_L2 128,64,6,0
+    KERNEL4x8_L2 128,64,7,0  
+    dcbt    AO, T5  
+    dcbt    BO, T3
+    KERNEL4x8_L2 128,64,8,0
+    KERNEL4x8_L2 128,64,9,0
+    KERNEL4x8_L2 128,64,10,0
+    KERNEL4x8_L2 128,64,11,0  
+    dcbt    BO, T4
+    KERNEL4x8_L2 128,64,12,0
+    KERNEL4x8_L2 128,64,13,0
+    KERNEL4x8_L2 128,64,14,0
+    KERNEL4x8_L2 128,64,15,0  
+    KERNEL4x8_L2 128,64,16,0
+    KERNEL4x8_L2 128,64,17,0 
+    KERNEL4x8_L2 128,64,18,0
+    KERNEL4x8_L2 128,64,19,0  
+    KERNEL4x8_L2 128,64,20,0
+    KERNEL4x8_L2 128,64,21,0 
+    KERNEL4x8_L2 128,64,22,0
+    KERNEL4x8_L2 128,64,23,0   
+    KERNEL4x8_L2 128,64,24,0
+    KERNEL4x8_L2 128,64,25,0
+    KERNEL4x8_L2 128,64,26,0
+    KERNEL4x8_L2 128,64,27,0  
+    KERNEL4x8_L2 128,64,28,0
+    KERNEL4x8_L2 128,64,29,0
+    KERNEL4x8_L2 128,64,30,0
+    KERNEL4x8_L2 128,64,31,0 
+    KERNEL4x8_L2 128,64,32,0
+    KERNEL4x8_L2 128,64,33,0
+    KERNEL4x8_L2 128,64,34,0
+    KERNEL4x8_L2 128,64,35,0 
+    KERNEL4x8_L2 128,64,36,0
+    KERNEL4x8_L2 128,64,37,0
+    KERNEL4x8_L2 128,64,38,0
+    KERNEL4x8_L2 128,64,39,0  
+    KERNEL4x8_L2 128,64,40,0
+    KERNEL4x8_L2 128,64,41,0
+    KERNEL4x8_L2 128,64,42,0
+    KERNEL4x8_L2 128,64,43,0  
+    KERNEL4x8_L2 128,64,44,0
+    KERNEL4x8_L2 128,64,45,0
+    KERNEL4x8_L2 128,64,46,0
+    KERNEL4x8_L2 128,64,47,0 
+    KERNEL4x8_L2 128,64,48,0
+    KERNEL4x8_L2 128,64,49,0 
+    KERNEL4x8_L2 128,64,50,0
+    KERNEL4x8_L2 128,64,51,0  
+    KERNEL4x8_L2 128,64,52,0
+    KERNEL4x8_L2 128,64,53,0 
+    KERNEL4x8_L2 128,64,54,0
+    KERNEL4x8_L2 128,64,55,0  
+    KERNEL4x8_L2 128,64,56,0
+    KERNEL4x8_L2 128,64,57,0
+    KERNEL4x8_L2 128,64,58,0
+    KERNEL4x8_L2 128,64,59,0  
+    KERNEL4x8_L2 128,64,60,0
+    KERNEL4x8_L2 128,64,61,0
+    KERNEL4x8_L2 128,64,62,0 
+    KERNEL4x8_L2 128,64,63,1  
+    bdnz    CGEMM_L4x8_LOOP
+    MY_ALIGN  
+CGEMM_L4x8_LOOP_END:
+/* CTR reached zero: run END4x8_2 (defined in cgemm_macros_power10.S) and return to the bl caller. */
+    END4x8_2
+    blr
+    MY_ALIGN
+
+
+CGEMM_4x8_L64_SUB:
+/* Straight-line block: 31 KERNEL4x8_L2 steps then KERNEL4x8_E2. Called from CGEMM_L4x8_SUB2 when bit 64 of L is set. */
+    LOAD4x8_2  
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL4x8_L2 128,64,0,0 
+    KERNEL4x8_L2 128,64,1,0
+    dcbt    AO, T2  
+    KERNEL4x8_L2 128,64,2,0
+    KERNEL4x8_L2 128,64,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL4x8_L2 128,64,4,0
+    KERNEL4x8_L2 128,64,5,0
+    dcbt    AO, T4  
+    KERNEL4x8_L2 128,64,6,0
+    KERNEL4x8_L2 128,64,7,0  
+    dcbt    AO, T5  
+    dcbt    BO, T3
+    KERNEL4x8_L2 128,64,8,0
+    KERNEL4x8_L2 128,64,9,0
+    KERNEL4x8_L2 128,64,10,0
+    KERNEL4x8_L2 128,64,11,0  
+    dcbt    BO, T4
+    KERNEL4x8_L2 128,64,12,0
+    KERNEL4x8_L2 128,64,13,0
+    KERNEL4x8_L2 128,64,14,0
+    KERNEL4x8_L2 128,64,15,0  
+    KERNEL4x8_L2 128,64,16,0
+    KERNEL4x8_L2 128,64,17,0 
+    KERNEL4x8_L2 128,64,18,0
+    KERNEL4x8_L2 128,64,19,0  
+    KERNEL4x8_L2 128,64,20,0
+    KERNEL4x8_L2 128,64,21,0 
+    KERNEL4x8_L2 128,64,22,0
+    KERNEL4x8_L2 128,64,23,0   
+    KERNEL4x8_L2 128,64,24,0
+    KERNEL4x8_L2 128,64,25,0
+    KERNEL4x8_L2 128,64,26,0
+    KERNEL4x8_L2 128,64,27,0  
+    KERNEL4x8_L2 128,64,28,0
+    KERNEL4x8_L2 128,64,29,0
+    KERNEL4x8_L2 128,64,30,0
+    KERNEL4x8_E2 128,64,31,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_4x8_L32_SUB:
+/* Straight-line block: 15 KERNEL4x8_L2 steps then KERNEL4x8_E2. Called from CGEMM_L4x8_SUB2_32 when bit 32 of L is set. */
+    LOAD4x8_2  
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL4x8_L2 128,64,0,0 
+    KERNEL4x8_L2 128,64,1,0
+    dcbt    AO, T2  
+    KERNEL4x8_L2 128,64,2,0
+    KERNEL4x8_L2 128,64,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL4x8_L2 128,64,4,0
+    KERNEL4x8_L2 128,64,5,0
+    dcbt    AO, T4  
+    KERNEL4x8_L2 128,64,6,0
+    KERNEL4x8_L2 128,64,7,0  
+    dcbt    AO, T5  
+    dcbt    BO, T3
+    KERNEL4x8_L2 128,64,8,0
+    KERNEL4x8_L2 128,64,9,0
+    KERNEL4x8_L2 128,64,10,0
+    KERNEL4x8_L2 128,64,11,0  
+    dcbt    BO, T4
+    KERNEL4x8_L2 128,64,12,0
+    KERNEL4x8_L2 128,64,13,0
+    KERNEL4x8_L2 128,64,14,0
+    KERNEL4x8_E2 128,64,15,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_4x8_L16_SUB:
+/* Straight-line block: 7 KERNEL4x8_L2 steps then KERNEL4x8_E2. Called from CGEMM_L4x8_SUB2_16 when bit 16 of L is set. */
+    LOAD4x8_2 
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL4x8_L2 128,64,0,0 
+    KERNEL4x8_L2 128,64,1,0
+    dcbt    AO, T2  
+    KERNEL4x8_L2 128,64,2,0
+    KERNEL4x8_L2 128,64,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL4x8_L2 128,64,4,0
+    KERNEL4x8_L2 128,64,5,0
+    dcbt    AO, T4  
+    KERNEL4x8_L2 128,64,6,0
+    KERNEL4x8_E2 128,64,7,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_4x4_LMAIN_SUB:
+/* Entered with bl from CGEMM_L4x4_BEGIN. T8 (pass count) goes into CTR. */
+    mtctr   T8
+    LOAD4x4_2  
+    MY_ALIGN
+CGEMM_L4x4_LOOP:
+/* One pass = 16 KERNEL4x4_L2 steps (index 0..15); only step 15 passes 1 as its last argument. */
+    KERNEL4x4_L2 64,64,0,0
+CGEMM_L4x4_K32:
+/* Second entry point: CGEMM_L4x4_SUB0 issues its own loads, sets CTR = 1 and calls here, so step 0 is skipped. */
+    KERNEL4x4_L2 64,64,1,0   
+    KERNEL4x4_L2 64,64,2,0
+    KERNEL4x4_L2 64,64,3,0  
+    KERNEL4x4_L2 64,64,4,0
+    KERNEL4x4_L2 64,64,5,0 
+    KERNEL4x4_L2 64,64,6,0
+    KERNEL4x4_L2 64,64,7,0
+    KERNEL4x4_L2 64,64,8,0
+    KERNEL4x4_L2 64,64,9,0   
+    KERNEL4x4_L2 64,64,10,0
+    KERNEL4x4_L2 64,64,11,0  
+    KERNEL4x4_L2 64,64,12,0
+    KERNEL4x4_L2 64,64,13,0 
+    KERNEL4x4_L2 64,64,14,0
+    KERNEL4x4_L2 64,64,15,1    
+    bdnz    CGEMM_L4x4_LOOP
+    MY_ALIGN  
+CGEMM_L4x4_LOOP_END:
+/* CTR reached zero: run END4x4_2 and return to the bl caller. */
+    END4x4_2 
+    blr
+    MY_ALIGN
+
+
+CGEMM_4x4_L16_SUB:
+/* Straight-line block: 7 KERNEL4x4_L2 steps then KERNEL4x4_E2. Called from CGEMM_L4x4_SUB2 when bit 16 of L is set. */
+    LOAD4x4_2
+    KERNEL4x4_L2 64,64,0,0
+    KERNEL4x4_L2 64,64,1,0   
+    KERNEL4x4_L2 64,64,2,0
+    KERNEL4x4_L2 64,64,3,0  
+    KERNEL4x4_L2 64,64,4,0
+    KERNEL4x4_L2 64,64,5,0 
+    KERNEL4x4_L2 64,64,6,0
+    KERNEL4x4_E2 64,64,7,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_4x4_L8_SUB:
+/* Straight-line block: 3 KERNEL4x4_L2 steps then KERNEL4x4_E2. Called from CGEMM_L4x4_SUB2_8 when bit 8 of L is set. */
+    LOAD4x4_2
+    KERNEL4x4_L2 64,64,0,0
+    KERNEL4x4_L2 64,64,1,0   
+    KERNEL4x4_L2 64,64,2,0
+    KERNEL4x4_E2 64,64,3,1 
+    blr
+
+
+CGEMM_4x2_LMAIN_SUB:
+/* Entered with bl from CGEMM_L4x2_BEGIN. T8 (pass count) goes into CTR. */
+    mtctr   T8
+    LOAD4x2_2  
+    MY_ALIGN 
+CGEMM_L4x2_LOOP:
+/* One pass = 16 KERNEL4x2_L2 steps (index 0..15); only step 15 passes 1 as its last argument. */
+    KERNEL4x2_L2 32,64,0,0 
+CGEMM_L4x2_K32:
+/* Second entry point: CGEMM_L4x2_SUB0 issues its own loads, sets CTR = 1 and calls here, so step 0 is skipped. */
+    KERNEL4x2_L2 32,64,1,0  
+    KERNEL4x2_L2 32,64,2,0
+    KERNEL4x2_L2 32,64,3,0  
+    KERNEL4x2_L2 32,64,4,0
+    KERNEL4x2_L2 32,64,5,0 
+    KERNEL4x2_L2 32,64,6,0
+    KERNEL4x2_L2 32,64,7,0
+    KERNEL4x2_L2 32,64,8,0
+    KERNEL4x2_L2 32,64,9,0  
+    KERNEL4x2_L2 32,64,10,0
+    KERNEL4x2_L2 32,64,11,0  
+    KERNEL4x2_L2 32,64,12,0
+    KERNEL4x2_L2 32,64,13,0 
+    KERNEL4x2_L2 32,64,14,0
+    KERNEL4x2_L2 32,64,15,1   
+    bdnz    CGEMM_L4x2_LOOP
+    MY_ALIGN  
+
+
+CGEMM_L4x2_LOOP_END:
+/* CTR reached zero: run END4x2_2 and return to the bl caller. */
+    END4x2_2 
+    blr
+    MY_ALIGN
+CGEMM_4x2_L16_SUB:
+/* Straight-line block: 7 KERNEL4x2_L2 steps then KERNEL4x2_E2. Called from CGEMM_L4x2_SUB2 when bit 16 of L is set. */
+    LOAD4x2_2
+    KERNEL4x2_L2 32,64,0,0
+    KERNEL4x2_L2 32,64,1,0  
+    KERNEL4x2_L2 32,64,2,0
+    KERNEL4x2_L2 32,64,3,0  
+    KERNEL4x2_L2 32,64,4,0
+    KERNEL4x2_L2 32,64,5,0 
+    KERNEL4x2_L2 32,64,6,0
+    KERNEL4x2_E2 32,64,7,1
+    blr
+    MY_ALIGN
+CGEMM_4x2_L8_SUB:
+/* Straight-line block: 3 KERNEL4x2_L2 steps then KERNEL4x2_E2. Called from CGEMM_L4x2_SUB2_8 when bit 8 of L is set. */
+    LOAD4x2_2
+    KERNEL4x2_L2 32,64,0,0
+    KERNEL4x2_L2 32,64,1,0  
+    KERNEL4x2_L2 32,64,2,0
+    KERNEL4x2_E2 32,64,3,1  
+    blr
+
+
+CGEMM_4x1_LMAIN_SUB:
+/* Entered with bl from CGEMM_L4x1_BEGIN. T8 (pass count) goes into CTR. */
+    mtctr   T8
+    LOAD4x1_2  
+    MY_ALIGN
+CGEMM_L4x1_LOOP:
+/* One pass = 16 KERNEL4x1_L2 steps (index 0..15); only step 15 passes 1 as its last argument. */
+    KERNEL4x1_L2 16,64,0,0 
+CGEMM_L4x1_K32:
+/* Second entry point: CGEMM_L4x1_SUB0 issues its own loads, sets CTR = 1 and calls here, so step 0 is skipped. */
+    KERNEL4x1_L2 16,64,1,0  
+    KERNEL4x1_L2 16,64,2,0
+    KERNEL4x1_L2 16,64,3,0  
+    KERNEL4x1_L2 16,64,4,0
+    KERNEL4x1_L2 16,64,5,0 
+    KERNEL4x1_L2 16,64,6,0
+    KERNEL4x1_L2 16,64,7,0
+    KERNEL4x1_L2 16,64,8,0
+    KERNEL4x1_L2 16,64,9,0  
+    KERNEL4x1_L2 16,64,10,0
+    KERNEL4x1_L2 16,64,11,0  
+    KERNEL4x1_L2 16,64,12,0
+    KERNEL4x1_L2 16,64,13,0 
+    KERNEL4x1_L2 16,64,14,0
+    KERNEL4x1_L2 16,64,15,1   
+    bdnz    CGEMM_L4x1_LOOP
+    MY_ALIGN  
+CGEMM_L4x1_LOOP_END:
+/* CTR reached zero: run END4x1_2 and return to the bl caller. */
+    END4x1_2 
+    blr
+
+    MY_ALIGN
+CGEMM_4x1_L16_SUB:
+/* Straight-line block: 7 KERNEL4x1_L2 steps then KERNEL4x1_E2. Called from CGEMM_L4x1_SUB2 when bit 16 of L is set. */
+    LOAD4x1_2
+    KERNEL4x1_L2 16,64,0,0
+    KERNEL4x1_L2 16,64,1,0  
+    KERNEL4x1_L2 16,64,2,0
+    KERNEL4x1_L2 16,64,3,0  
+    KERNEL4x1_L2 16,64,4,0
+    KERNEL4x1_L2 16,64,5,0 
+    KERNEL4x1_L2 16,64,6,0
+    KERNEL4x1_E2 16,64,7,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_4x1_L8_SUB:
+/* Straight-line block: 3 KERNEL4x1_L2 steps then KERNEL4x1_E2. Called from CGEMM_L4x1_SUB2_8 when bit 8 of L is set. */
+    LOAD4x1_2
+    KERNEL4x1_L2 16,64,0,0
+    KERNEL4x1_L2 16,64,1,0  
+    KERNEL4x1_L2 16,64,2,0
+    KERNEL4x1_E2 16,64,3,1  
+    blr
+
+
+
+/*             MAIN LOOP BEGINS               */   
+    MY_ALIGN
+
+
+CGEMM_L4:
+/* Target of the `b CGEMM_L4` at the top of this file: one-time setup, then J = N >> 2 iterations of CGEMM_L4_BEGIN. */
+#if defined(TRMMKERNEL) && !defined(LEFT)   
+    neg TEMP_REG, OFFSET 
+#endif   
+    /* Pre-set vs57 (= v25) to 0xffffffff00000000 in each doubleword for masking; v24 is vs56 */
+    vspltisb v24, -1
+    vspltisb v25, 0
+    xxsldwi vs57, vs56, vs57, 1
+    xxpermdi vs57, vs57, vs57, 3
+    srawi.    J,  N,  2
+    ble   CGEMM_L4_END
+
+
+CGEMM_L4_BEGIN:
+/* Top of one J iteration: CO = current C, AO = A, then C is advanced by 4*LDC for the next iteration. */
+    mr    CO, C
+    slwi    T1, LDC , 2     
+    add     T2,C,LDC    
+    mr    AO, A  
+    add   C,  C,  T1
+#if defined(TRMMKERNEL) && defined(LEFT)   
+    mr TEMP_REG, OFFSET  /*off = offset;*/
+#endif     
+    srawi.    I,  M,  3
+    ble   CGEMM_L4x8_END
+    dcbt    CO,r0  /*just prefetch*/
+    dcbt    T2,r0    
+
+
+CGEMM_L4x8_BEGIN:
+/* Top of one I iteration (I = M >> 3): set BO, the dcbt offsets T2..T5 and the pass count T8, then dispatch. */
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,8,4
+#else    
+    mr    BO, B  
+    dcbt    B,  r0  
+#endif     
+    dcbt    AO, r0
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,8,4
+    mr T1, T6
+/* TEMPS FOR PREFETCH */   
+    li T2, 1024
+    li T3, 1024+512
+    addi T1,T1, -2
+/* TEMPS FOR PREFETCH */     
+    li T4, 2048
+    li T5, 2048+512   
+    srawi.   T8, T1, 7 /* T8 = (T6-2) >> 7: passes of the unrolled loop; sets CR0 */
+#else   
+    mr T1, K
+/* TEMPS FOR PREFETCH */   
+    li T2, 1024
+    li T3, 1024+512
+    addi T1,T1, -2
+/* TEMPS FOR PREFETCH */     
+    li T4, 2048
+    li T5, 2048+512 
+    srawi.   T8, T1, 7 /* T8 = (K-2) >> 7: passes of the unrolled loop; sets CR0 */
+#endif   
+    ZERO4x8  
+    ble   CGEMM_L4x8_SUB0
+    bl CGEMM_L4x8_LMAIN_SUB
+    andi.   L,  T1, 127
+    ble   CGEMM_L4x8_SAVE
+    b   CGEMM_L4x8_SUB2
+
+
+CGEMM_L4x8_SUB0:
+/* Reached when T8 <= 0. k (T6 for TRMM, else K) == 129 is handled here, k == 128 at CMP4x8_128K, anything else by SUB2 with L = k & 255. */
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 255
+    cmpwi   T6,129
+#else   
+    andi.   L,  K,  255
+    cmpwi   K,129
+#endif       
+    li T8,1
+    bne CMP4x8_128K
+    addi BO,BO,-32
+    addi AO,AO,-64 
+    LOAD4x8O 64,32 
+    END4x8_WITHOUT_ADD   
+    LOAD4x8_2O  128, 64 
+    mtctr   T8    
+    bl CGEMM_L4x8_K128   
+    b CGEMM_L4x8_SAVE  
+    CMP4x8_128K:
+/* k == 128: back AO/BO up, load with LOAD4x8_2O and run the K128 entry once (CTR = T8 = 1). */
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,128
+#else    
+    cmpwi   K,128
+#endif        
+    bne CGEMM_L4x8_SUB2 
+    MY_ALIGN   
+    mtctr   T8
+    addi BO,BO,-64
+    addi AO,AO,-128   
+    LOAD4x8_2O 128,64
+    bl CGEMM_L4x8_K128   
+    b CGEMM_L4x8_SAVE 
+    MY_ALIGN
+
+
+CGEMM_L4x8_SUB2:
+/* Remainder handling: test bits 64,32,16,8,4,2,1 of L in turn. andi. never gives a negative result, so each ble is taken only on zero. */
+    andi.      T1,L, 64
+    ble CGEMM_L4x8_SUB2_32
+    bl  CGEMM_4x8_L64_SUB
+    MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_32:
+/* Bit 32 of L. */
+    andi.      T1,L, 32
+    ble CGEMM_L4x8_SUB2_16    
+    bl  CGEMM_4x8_L32_SUB
+    MY_ALIGN 
+
+
+CGEMM_L4x8_SUB2_16:
+/* Bit 16 of L. */
+    andi.      T1,L, 16
+    ble CGEMM_L4x8_SUB2_8
+    bl  CGEMM_4x8_L16_SUB  
+    MY_ALIGN    
+
+
+CGEMM_L4x8_SUB2_8:
+/* Bit 8 of L: expanded inline (3 L2 steps + E2) instead of calling a subroutine. */
+    andi.      T1,L, 8
+    ble CGEMM_L4x8_SUB2_4
+    LOAD4x8_2
+    KERNEL4x8_L2  128,64, 0,0
+    KERNEL4x8_L2  128,64, 1,0
+    KERNEL4x8_L2  128,64, 2,0
+    KERNEL4x8_E2  128,64, 3,1
+    MY_ALIGN   
+
+
+CGEMM_L4x8_SUB2_4:
+/* Bit 4 of L: 1 L2 step + E2. */
+    andi.      T1,L, 4
+    ble CGEMM_L4x8_SUB2_2
+    LOAD4x8_2
+    KERNEL4x8_L2  128,64, 0,0
+    KERNEL4x8_E2  128,64, 1,1
+    MY_ALIGN
+
+
+CGEMM_L4x8_SUB2_2:
+/* Bit 2 of L: E2 only. */
+    andi.      T1,L, 2
+    ble CGEMM_L4x8_SUB2_1
+    LOAD4x8_2 
+    KERNEL4x8_E2  128,64, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L4x8_SUB2_1:
+/* Bit 1 of L: a single KERNEL4x8, then fall through to CGEMM_L4x8_SAVE. */
+    andi.      T1,L, 1
+    ble CGEMM_L4x8_SAVE 
+    KERNEL4x8
+
+    MY_ALIGN
+CGEMM_L4x8_SAVE:
+/* Decrement I, run SAVE4x8, loop while I > 0. NOTE(review): bgt reads CR0 from addic., so the macros in between must leave CR0 intact - confirm in cgemm_macros_power10.S. */
+    addic.    I,  I,  -1
+    MY_ALIGN
+    SAVE4x8
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4
+#endif     
+    bgt   CGEMM_L4x8_BEGIN
+    andi.   T2, M,  7
+    ble   CGEMM_L4x1_END
+    andi.   T1, M,  4
+    ble   CGEMM_L4x4_END
+    b   CGEMM_L4x4_BEGIN
+    MY_ALIGN 
+
+
+CGEMM_L4x8_END:
+/* Reached from CGEMM_L4_BEGIN when M >> 3 is zero; falls through to CGEMM_L4x4_BEGIN, which repeats the M & 7 / M & 4 tests. */
+
+
+CGEMM_L4x4_BEGIN:
+/* Skip the whole M tail when M & 7 == 0, and this 4-row piece when bit 4 of M is clear; otherwise set BO and T8 and dispatch. */
+    andi.   T2, M,  7
+    ble   CGEMM_L4x1_END
+    andi.   T1, M,  4
+    ble   CGEMM_L4x4_END
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,4,4
+#else    
+    mr    BO, B   
+#endif        
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,4,4
+    mr T1, T6 
+    addi T1,T1, -2 
+    srawi.   T8, T1, 5 /* T8 = (T6-2) >> 5: passes of the unrolled loop; sets CR0 */
+#else   
+    mr T1, K 
+    addi T1,T1, -2
+    srawi.   T8, T1, 5 /* T8 = (K-2) >> 5: passes of the unrolled loop; sets CR0 */
+#endif     
+    ZERO4x4
+    ble   CGEMM_L4x4_SUB0 
+    bl CGEMM_4x4_LMAIN_SUB
+    andi.   L,  T1, 31
+    ble   CGEMM_L4x4_SAVE
+    b    CGEMM_L4x4_SUB2
+
+
+CGEMM_L4x4_SUB0:
+/* Reached when T8 <= 0. k (T6 for TRMM, else K) == 33 is handled here, k == 32 at CMP4x4_32K, anything else by SUB2 with L = k & 63. */
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 63
+    cmpwi   T6,33
+#else   
+    andi.   L,  K,  63
+    cmpwi   K,33
+#endif       
+    li T8,1
+    bne CMP4x4_32K
+    addi BO,BO,-32
+    addi AO,AO,-32  
+    LOAD4x4O 32,32 
+    END4x4_WITHOUT_ADD   
+    LOAD4x4_2O  64, 64 
+    mtctr   T8    
+    bl CGEMM_L4x4_K32   
+    b CGEMM_L4x4_SAVE  
+    CMP4x4_32K:
+/* k == 32: back AO/BO up, load with LOAD4x4_2O and run the K32 entry once (CTR = T8 = 1). */
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,32
+#else    
+    cmpwi   K,32
+#endif        
+    bne CGEMM_L4x4_SUB2 
+    MY_ALIGN   
+    mtctr   T8
+    addi BO,BO,-64
+    addi AO,AO,-64   
+    LOAD4x4_2O 64,64
+    bl CGEMM_L4x4_K32   
+    b CGEMM_L4x4_SAVE 
+    MY_ALIGN 
+    MY_ALIGN 
+
+
+CGEMM_L4x4_SUB2:
+/* Remainder handling: test bits 16,8,4,2,1 of L in turn. andi. never gives a negative result, so each ble is taken only on zero. */
+    andi.      T1,L, 16
+    ble CGEMM_L4x4_SUB2_8
+    bl  CGEMM_4x4_L16_SUB  
+    MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_8:
+/* Bit 8 of L. */
+    andi.      T1,L, 8
+    ble CGEMM_L4x4_SUB2_4
+    bl CGEMM_4x4_L8_SUB
+    MY_ALIGN  
+
+
+CGEMM_L4x4_SUB2_4:
+/* Bit 4 of L: expanded inline, 1 L2 step + E2. */
+    andi.      T1,L, 4
+    ble CGEMM_L4x4_SUB2_2
+    LOAD4x4_2
+    KERNEL4x4_L2  64,64, 0,0
+    KERNEL4x4_E2  64,64, 1,1
+    MY_ALIGN
+
+
+CGEMM_L4x4_SUB2_2:
+/* Bit 2 of L: E2 only. */
+    andi.      T1,L, 2
+    ble CGEMM_L4x4_SUB2_1
+    LOAD4x4_2
+    KERNEL4x4_E2  64,64, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L4x4_SUB2_1:
+/* Bit 1 of L: a single KERNEL4x4, then fall through to CGEMM_L4x4_SAVE. */
+    andi.      T1,L, 1
+    ble CGEMM_L4x4_SAVE 
+    KERNEL4x4
+
+
+CGEMM_L4x4_SAVE:
+/* Run SAVE4x4 once (no loop: the 4-row piece occurs at most once per J iteration), then fall through. */
+    SAVE4x4
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4
+#endif     
+
+
+CGEMM_L4x4_END:
+/* Also the branch target when bit 4 of M is clear; execution continues at CGEMM_L4x2_BEGIN. */
+
+
+CGEMM_L4x2_BEGIN:
+/* Skip this 2-row piece when bit 2 of M is clear; otherwise set BO and T8 and dispatch. */
+    andi.   T1, M,  2
+    ble   CGEMM_L4x2_END
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,2,4
+#else    
+    mr    BO, B   
+#endif        
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,2,4
+    mr T1, T6 
+    addi T1,T1, -2 
+    srawi.   T8, T1, 5 /* T8 = (T6-2) >> 5: passes of the unrolled loop; sets CR0 */
+#else   
+    mr T1, K 
+    addi T1,T1, -2
+    srawi.   T8, T1, 5 /* T8 = (K-2) >> 5: passes of the unrolled loop; sets CR0 */
+#endif     
+    ZERO4x2
+    ble   CGEMM_L4x2_SUB0 
+    bl CGEMM_4x2_LMAIN_SUB
+    andi.   L,  T1, 31
+    ble   CGEMM_L4x2_SAVE
+    b   CGEMM_L4x2_SUB2
+
+
+CGEMM_L4x2_SUB0:
+/* Reached when T8 <= 0. k (T6 for TRMM, else K) == 33 is handled here, k == 32 at CMP4x2_32K, anything else by SUB2 with L = k & 63. */
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 63
+    cmpwi   T6,33
+#else   
+    andi.   L,  K,  63
+    cmpwi   K,33
+#endif       
+    li T8,1
+    bne CMP4x2_32K
+    addi BO,BO,-32
+    addi AO,AO,-16  
+    LOAD4x2O 16,32 
+    END4x2_WITHOUT_ADD   
+    LOAD4x2_2O  32, 64  
+    mtctr   T8    
+    bl CGEMM_L4x2_K32   
+    b CGEMM_L4x2_SAVE  
+    CMP4x2_32K:
+/* k == 32: back AO/BO up, load with LOAD4x2_2O and run the K32 entry once (CTR = T8 = 1). */
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,32
+#else    
+    cmpwi   K,32
+#endif        
+    bne CGEMM_L4x2_SUB2 
+    MY_ALIGN   
+    mtctr   T8
+    addi BO,BO,-64
+    addi AO,AO,-32   
+    LOAD4x2_2O 32,64
+    bl CGEMM_L4x2_K32   
+    b CGEMM_L4x2_SAVE 
+    MY_ALIGN 
+    MY_ALIGN 
+
+
+CGEMM_L4x2_SUB2:
+/* Remainder handling: test bits 16,8,4,2,1 of L in turn. andi. never gives a negative result, so each ble is taken only on zero. */
+    andi.      T1,L, 16
+    ble CGEMM_L4x2_SUB2_8
+    bl CGEMM_4x2_L16_SUB  
+    MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_8:
+/* Bit 8 of L. */
+    andi.      T1,L, 8
+    ble CGEMM_L4x2_SUB2_4
+    bl CGEMM_4x2_L8_SUB
+    MY_ALIGN  
+
+
+CGEMM_L4x2_SUB2_4:
+/* Bit 4 of L: expanded inline, 1 L2 step + E2. */
+    andi.      T1,L, 4
+    ble CGEMM_L4x2_SUB2_2
+    LOAD4x2_2
+    KERNEL4x2_L2  32,64, 0,0
+    KERNEL4x2_E2  32,64, 1,1
+    MY_ALIGN
+
+
+CGEMM_L4x2_SUB2_2:
+/* Bit 2 of L: E2 only. */
+    andi.      T1,L, 2
+    ble CGEMM_L4x2_SUB2_1
+    LOAD4x2_2
+    KERNEL4x2_E2  32,64, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L4x2_SUB2_1:
+/* Bit 1 of L: a single KERNEL4x2, then fall through to CGEMM_L4x2_SAVE. */
+    andi.      T1,L, 1
+    ble CGEMM_L4x2_SAVE 
+    KERNEL4x2
+
+    MY_ALIGN
+CGEMM_L4x2_SAVE:
+/* Run SAVE4x2 once, then fall through. */
+    SAVE4x2
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4
+#endif     
+
+
+CGEMM_L4x2_END:
+/* Also the branch target when bit 2 of M is clear; execution continues at CGEMM_L4x1_BEGIN. */
+
+
+CGEMM_L4x1_BEGIN:
+/* Skip this 1-row piece when bit 1 of M is clear; otherwise set BO and T8 and dispatch. */
+    andi.   T1, M,  1
+    ble   CGEMM_L4x1_END
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,1,4
+#else    
+    mr    BO, B   
+#endif        
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,1,4
+    mr T1, T6 
+    addi T1,T1, -2 
+    srawi.   T8, T1, 5 /* T8 = (T6-2) >> 5: passes of the unrolled loop; sets CR0 */
+#else   
+    mr T1, K 
+    addi T1,T1, -2
+    srawi.   T8, T1, 5 /* T8 = (K-2) >> 5: passes of the unrolled loop; sets CR0 */
+#endif     
+    ZERO4x1
+    ble   CGEMM_L4x1_SUB0 
+    bl CGEMM_4x1_LMAIN_SUB
+    andi.   L,  T1, 31
+    ble   CGEMM_L4x1_SAVE
+    b   CGEMM_L4x1_SUB2
+
+
+CGEMM_L4x1_SUB0:
+/* Reached when T8 <= 0. k (T6 for TRMM, else K) == 33 is handled here, k == 32 at CMP4x1_32K, anything else by SUB2 with L = k & 63. */
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 63
+    cmpwi   T6,33
+#else   
+    andi.   L,  K,  63
+    cmpwi   K,33
+#endif       
+    li T8,1
+    bne CMP4x1_32K
+    addi BO,BO,-32
+    addi AO,AO,-8  
+    LOAD4x1O 8,32 
+    END4x1_WITHOUT_ADD   
+    LOAD4x1_2O  16, 64  
+    mtctr   T8    
+    bl CGEMM_L4x1_K32   
+    b CGEMM_L4x1_SAVE  
+    CMP4x1_32K:
+/* k == 32: back AO/BO up, load with LOAD4x1_2O and run the K32 entry once (CTR = T8 = 1). */
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,32
+#else    
+    cmpwi   K,32
+#endif        
+    bne CGEMM_L4x1_SUB2 
+    MY_ALIGN   
+    mtctr   T8
+    addi BO,BO,-64
+    addi AO,AO,-16   
+    LOAD4x1_2O 16,64
+    bl CGEMM_L4x1_K32   
+    b CGEMM_L4x1_SAVE 
+    MY_ALIGN 
+    MY_ALIGN 
+
+
+CGEMM_L4x1_SUB2:
+/*----------------------------------------*/   
+    andi.      T1,L, 16
+    ble CGEMM_L4x1_SUB2_8
+    bl CGEMM_4x1_L16_SUB  
+    MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_8:
+/*----------------------------------------*/   
+    andi.      T1,L, 8
+    ble CGEMM_L4x1_SUB2_4
+    bl CGEMM_4x1_L8_SUB
+    MY_ALIGN  
+
+
+CGEMM_L4x1_SUB2_4:
+/*----------------------------------------*/   
+    andi.      T1,L, 4
+    ble CGEMM_L4x1_SUB2_2
+    LOAD4x1_2
+    KERNEL4x1_L2  16,64, 0,0
+    KERNEL4x1_E2  16,64, 1,1
+    MY_ALIGN
+
+
+CGEMM_L4x1_SUB2_2:
+/*----------------------------------------*/   
+    andi.      T1,L, 2
+    ble CGEMM_L4x1_SUB2_1
+    LOAD4x1_2
+    KERNEL4x1_E2  16,64, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L4x1_SUB2_1:
+/*----------------------------------------*/   
+    andi.      T1,L, 1
+    ble CGEMM_L4x1_SAVE 
+    KERNEL4x1
+
+    MY_ALIGN
+CGEMM_L4x1_SAVE:
+/*----------------------------------------*/  
+     
+    SAVE4x1
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4
+#endif   
+
+
+CGEMM_L4x1_END:
+/*----------------------------------------*/   
+    slwi    T1, K,  5
+    addic.    J,  J,  -1
+    add   B,  B,  T1
+#if defined(TRMMKERNEL) && !defined(LEFT)   
+    addi TEMP_REG, TEMP_REG, 4
+#endif   
+    bgt   CGEMM_L4_BEGIN
+
+
+CGEMM_L4_END:
+
+b CGEMM_L2
+/*   MINI SUBROUTINES for the N&2 pass: reached with bl, left with blr; the branch above jumps over them   */
+/*   2x8 main loop; the caller sets T8 = (count-2) >> 7, presumably covering 128*T8 + 2 k-steps   */
+
+
+CGEMM_L2x8_LMAIN_SUB:
+/* 2x8 main loop subroutine: CTR = T8 passes over 64 unrolled KERNEL2x8_L2 steps */
+    mtctr   T8
+    LOAD2x8_2 
+    MY_ALIGN
+CGEMM_L2x8_LOOP:
+/* Loop head: prefetch at AO+PRE / BO+PRE, then step 0 */
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL2x8_L2 128,32,0,0 
+CGEMM_L2x8_K128:
+/* Alternate entry used by CGEMM_L2x8_SUB0 (count 128/129): skips step 0 after its own loads */
+    KERNEL2x8_L2 128,32,1,0
+    dcbt    AO, T2  
+    KERNEL2x8_L2 128,32,2,0
+    KERNEL2x8_L2 128,32,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL2x8_L2 128,32,4,0
+    KERNEL2x8_L2 128,32,5,0
+    dcbt    AO, T4  
+    KERNEL2x8_L2 128,32,6,0
+    KERNEL2x8_L2 128,32,7,0  
+    dcbt    AO, T5  
+    dcbt    BO, T3
+    KERNEL2x8_L2 128,32,8,0
+    KERNEL2x8_L2 128,32,9,0
+    KERNEL2x8_L2 128,32,10,0
+    KERNEL2x8_L2 128,32,11,0  
+    dcbt    BO, T4
+    KERNEL2x8_L2 128,32,12,0
+    KERNEL2x8_L2 128,32,13,0
+    KERNEL2x8_L2 128,32,14,0
+    KERNEL2x8_L2 128,32,15,0  
+    KERNEL2x8_L2 128,32,16,0
+    KERNEL2x8_L2 128,32,17,0 
+    KERNEL2x8_L2 128,32,18,0
+    KERNEL2x8_L2 128,32,19,0  
+    KERNEL2x8_L2 128,32,20,0
+    KERNEL2x8_L2 128,32,21,0 
+    KERNEL2x8_L2 128,32,22,0
+    KERNEL2x8_L2 128,32,23,0   
+    KERNEL2x8_L2 128,32,24,0
+    KERNEL2x8_L2 128,32,25,0
+    KERNEL2x8_L2 128,32,26,0
+    KERNEL2x8_L2 128,32,27,0  
+    KERNEL2x8_L2 128,32,28,0
+    KERNEL2x8_L2 128,32,29,0
+    KERNEL2x8_L2 128,32,30,0
+    KERNEL2x8_L2 128,32,31,0 
+    KERNEL2x8_L2 128,32,32,0
+    KERNEL2x8_L2 128,32,33,0
+    KERNEL2x8_L2 128,32,34,0
+    KERNEL2x8_L2 128,32,35,0 
+    KERNEL2x8_L2 128,32,36,0
+    KERNEL2x8_L2 128,32,37,0
+    KERNEL2x8_L2 128,32,38,0
+    KERNEL2x8_L2 128,32,39,0  
+    KERNEL2x8_L2 128,32,40,0
+    KERNEL2x8_L2 128,32,41,0
+    KERNEL2x8_L2 128,32,42,0
+    KERNEL2x8_L2 128,32,43,0  
+    KERNEL2x8_L2 128,32,44,0
+    KERNEL2x8_L2 128,32,45,0
+    KERNEL2x8_L2 128,32,46,0
+    KERNEL2x8_L2 128,32,47,0 
+    KERNEL2x8_L2 128,32,48,0
+    KERNEL2x8_L2 128,32,49,0 
+    KERNEL2x8_L2 128,32,50,0
+    KERNEL2x8_L2 128,32,51,0  
+    KERNEL2x8_L2 128,32,52,0
+    KERNEL2x8_L2 128,32,53,0 
+    KERNEL2x8_L2 128,32,54,0
+    KERNEL2x8_L2 128,32,55,0  
+    KERNEL2x8_L2 128,32,56,0
+    KERNEL2x8_L2 128,32,57,0
+    KERNEL2x8_L2 128,32,58,0
+    KERNEL2x8_L2 128,32,59,0  
+    KERNEL2x8_L2 128,32,60,0
+    KERNEL2x8_L2 128,32,61,0
+    KERNEL2x8_L2 128,32,62,0 
+    KERNEL2x8_L2 128,32,63,1  
+    bdnz    CGEMM_L2x8_LOOP
+    MY_ALIGN  
+CGEMM_L2x8_LOOP_END:
+/* CTR exhausted: END2x8_2 presumably consumes the last preloaded operands; return to caller */
+    END2x8_2
+    blr
+    MY_ALIGN
+
+
+CGEMM_2x8_L64_SUB:
+/* 2x8 remainder chunk for bit 64 of L: load, 31 KERNEL2x8_L2 steps, closing KERNEL2x8_E2, blr */
+    LOAD2x8_2  
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL2x8_L2 128,32,0,0 
+    KERNEL2x8_L2 128,32,1,0
+    dcbt    AO, T2  
+    KERNEL2x8_L2 128,32,2,0
+    KERNEL2x8_L2 128,32,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL2x8_L2 128,32,4,0
+    KERNEL2x8_L2 128,32,5,0
+    dcbt    AO, T4  
+    KERNEL2x8_L2 128,32,6,0
+    KERNEL2x8_L2 128,32,7,0  
+    dcbt    AO, T5  
+    dcbt    BO, T3
+    KERNEL2x8_L2 128,32,8,0
+    KERNEL2x8_L2 128,32,9,0
+    KERNEL2x8_L2 128,32,10,0
+    KERNEL2x8_L2 128,32,11,0  
+    dcbt    BO, T4
+    KERNEL2x8_L2 128,32,12,0
+    KERNEL2x8_L2 128,32,13,0
+    KERNEL2x8_L2 128,32,14,0
+    KERNEL2x8_L2 128,32,15,0  
+    KERNEL2x8_L2 128,32,16,0
+    KERNEL2x8_L2 128,32,17,0 
+    KERNEL2x8_L2 128,32,18,0
+    KERNEL2x8_L2 128,32,19,0  
+    KERNEL2x8_L2 128,32,20,0
+    KERNEL2x8_L2 128,32,21,0 
+    KERNEL2x8_L2 128,32,22,0
+    KERNEL2x8_L2 128,32,23,0   
+    KERNEL2x8_L2 128,32,24,0
+    KERNEL2x8_L2 128,32,25,0
+    KERNEL2x8_L2 128,32,26,0
+    KERNEL2x8_L2 128,32,27,0  
+    KERNEL2x8_L2 128,32,28,0
+    KERNEL2x8_L2 128,32,29,0
+    KERNEL2x8_L2 128,32,30,0
+    KERNEL2x8_E2 128,32,31,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_2x8_L32_SUB:
+/* 2x8 remainder chunk for bit 32 of L: load, 15 KERNEL2x8_L2 steps, closing KERNEL2x8_E2, blr */
+    LOAD2x8_2  
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL2x8_L2 128,32,0,0 
+    KERNEL2x8_L2 128,32,1,0
+    dcbt    AO, T2  
+    KERNEL2x8_L2 128,32,2,0
+    KERNEL2x8_L2 128,32,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL2x8_L2 128,32,4,0
+    KERNEL2x8_L2 128,32,5,0
+    dcbt    AO, T4  
+    KERNEL2x8_L2 128,32,6,0
+    KERNEL2x8_L2 128,32,7,0  
+    dcbt    AO, T5  
+    dcbt    BO, T3
+    KERNEL2x8_L2 128,32,8,0
+    KERNEL2x8_L2 128,32,9,0
+    KERNEL2x8_L2 128,32,10,0
+    KERNEL2x8_L2 128,32,11,0  
+    dcbt    BO, T4
+    KERNEL2x8_L2 128,32,12,0
+    KERNEL2x8_L2 128,32,13,0
+    KERNEL2x8_L2 128,32,14,0
+    KERNEL2x8_E2 128,32,15,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_2x8_L16_SUB:
+/* 2x8 remainder chunk for bit 16 of L: load, 7 KERNEL2x8_L2 steps, closing KERNEL2x8_E2, blr */
+    LOAD2x8_2 
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL2x8_L2 128,32,0,0 
+    KERNEL2x8_L2 128,32,1,0
+    dcbt    AO, T2  
+    KERNEL2x8_L2 128,32,2,0
+    KERNEL2x8_L2 128,32,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL2x8_L2 128,32,4,0
+    KERNEL2x8_L2 128,32,5,0
+    dcbt    AO, T4  
+    KERNEL2x8_L2 128,32,6,0
+    KERNEL2x8_E2 128,32,7,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_2x4_LMAIN_SUB:
+/* 2x4 main loop subroutine: CTR = T8 passes over 16 unrolled KERNEL2x4_L2 steps */
+    mtctr   T8
+    LOAD2x4_2  
+    MY_ALIGN
+CGEMM_L2x4_LOOP:
+/* Loop head: step 0 */
+    KERNEL2x4_L2 64,32,0,0
+CGEMM_L2x4_K32:
+/* Alternate entry used by CGEMM_L2x4_SUB0 (count 32/33): skips step 0 after its own loads */
+    KERNEL2x4_L2 64,32,1,0   
+    KERNEL2x4_L2 64,32,2,0
+    KERNEL2x4_L2 64,32,3,0  
+    KERNEL2x4_L2 64,32,4,0
+    KERNEL2x4_L2 64,32,5,0 
+    KERNEL2x4_L2 64,32,6,0
+    KERNEL2x4_L2 64,32,7,0
+    KERNEL2x4_L2 64,32,8,0
+    KERNEL2x4_L2 64,32,9,0   
+    KERNEL2x4_L2 64,32,10,0
+    KERNEL2x4_L2 64,32,11,0  
+    KERNEL2x4_L2 64,32,12,0
+    KERNEL2x4_L2 64,32,13,0 
+    KERNEL2x4_L2 64,32,14,0
+    KERNEL2x4_L2 64,32,15,1    
+    bdnz    CGEMM_L2x4_LOOP
+    MY_ALIGN  
+CGEMM_L2x4_LOOP_END:
+/* CTR exhausted: END2x4_2 presumably consumes the last preloaded operands; return to caller */
+    END2x4_2 
+    blr
+    MY_ALIGN
+
+
+CGEMM_2x4_L16_SUB:
+/* 2x4 remainder chunk for bit 16 of L: load, 7 KERNEL2x4_L2 steps, closing KERNEL2x4_E2, blr */
+    LOAD2x4_2
+    KERNEL2x4_L2 64,32,0,0
+    KERNEL2x4_L2 64,32,1,0   
+    KERNEL2x4_L2 64,32,2,0
+    KERNEL2x4_L2 64,32,3,0  
+    KERNEL2x4_L2 64,32,4,0
+    KERNEL2x4_L2 64,32,5,0 
+    KERNEL2x4_L2 64,32,6,0
+    KERNEL2x4_E2 64,32,7,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_2x4_L8_SUB:
+/* 2x4 remainder chunk for bit 8 of L: load, 3 KERNEL2x4_L2 steps, closing KERNEL2x4_E2, blr */
+    LOAD2x4_2
+    KERNEL2x4_L2 64,32,0,0
+    KERNEL2x4_L2 64,32,1,0   
+    KERNEL2x4_L2 64,32,2,0
+    KERNEL2x4_E2 64,32,3,1 
+    blr
+
+
+CGEMM_2x2_LMAIN_SUB:
+/* 2x2 main loop subroutine: CTR = T8 passes over 16 unrolled KERNEL2x2_L2 steps */
+    mtctr   T8
+    LOAD2x2_2  
+    MY_ALIGN 
+CGEMM_L2x2_LOOP:
+/* Loop head: step 0 */
+    KERNEL2x2_L2 32,32,0,0 
+CGEMM_L2x2_K32:
+/* Alternate entry used by CGEMM_L2x2_SUB0 (count 32/33): skips step 0 after its own loads */
+    KERNEL2x2_L2 32,32,1,0  
+    KERNEL2x2_L2 32,32,2,0
+    KERNEL2x2_L2 32,32,3,0  
+    KERNEL2x2_L2 32,32,4,0
+    KERNEL2x2_L2 32,32,5,0 
+    KERNEL2x2_L2 32,32,6,0
+    KERNEL2x2_L2 32,32,7,0
+    KERNEL2x2_L2 32,32,8,0
+    KERNEL2x2_L2 32,32,9,0  
+    KERNEL2x2_L2 32,32,10,0
+    KERNEL2x2_L2 32,32,11,0  
+    KERNEL2x2_L2 32,32,12,0
+    KERNEL2x2_L2 32,32,13,0 
+    KERNEL2x2_L2 32,32,14,0
+    KERNEL2x2_L2 32,32,15,1   
+    bdnz    CGEMM_L2x2_LOOP
+    MY_ALIGN  
+
+
+CGEMM_L2x2_LOOP_END:
+/* CTR exhausted: END2x2_2 presumably consumes the last preloaded operands; return to caller */
+    END2x2_2 
+    blr
+    MY_ALIGN
+CGEMM_2x2_L16_SUB:
+/* 2x2 remainder chunk for bit 16 of L: load, 7 KERNEL2x2_L2 steps, closing KERNEL2x2_E2, blr */
+    LOAD2x2_2
+    KERNEL2x2_L2 32,32,0,0
+    KERNEL2x2_L2 32,32,1,0  
+    KERNEL2x2_L2 32,32,2,0
+    KERNEL2x2_L2 32,32,3,0  
+    KERNEL2x2_L2 32,32,4,0
+    KERNEL2x2_L2 32,32,5,0 
+    KERNEL2x2_L2 32,32,6,0
+    KERNEL2x2_E2 32,32,7,1
+    blr
+    MY_ALIGN
+CGEMM_2x2_L8_SUB:
+/* 2x2 remainder chunk for bit 8 of L: load, 3 KERNEL2x2_L2 steps, closing KERNEL2x2_E2, blr */
+    LOAD2x2_2
+    KERNEL2x2_L2 32,32,0,0
+    KERNEL2x2_L2 32,32,1,0  
+    KERNEL2x2_L2 32,32,2,0
+    KERNEL2x2_E2 32,32,3,1  
+    blr
+
+
+CGEMM_2x1_LMAIN_SUB:
+/* 2x1 main loop subroutine: CTR = T8 passes over 16 unrolled KERNEL2x1_L2 steps */
+    mtctr   T8
+    LOAD2x1_2  
+    MY_ALIGN
+CGEMM_L2x1_LOOP:
+/* Loop head: step 0 */
+    KERNEL2x1_L2 16,32,0,0 
+CGEMM_L2x1_K32:
+/* Alternate entry used by CGEMM_L2x1_SUB0 (count 32/33): skips step 0 after its own loads */
+    KERNEL2x1_L2 16,32,1,0  
+    KERNEL2x1_L2 16,32,2,0
+    KERNEL2x1_L2 16,32,3,0  
+    KERNEL2x1_L2 16,32,4,0
+    KERNEL2x1_L2 16,32,5,0 
+    KERNEL2x1_L2 16,32,6,0
+    KERNEL2x1_L2 16,32,7,0
+    KERNEL2x1_L2 16,32,8,0
+    KERNEL2x1_L2 16,32,9,0  
+    KERNEL2x1_L2 16,32,10,0
+    KERNEL2x1_L2 16,32,11,0  
+    KERNEL2x1_L2 16,32,12,0
+    KERNEL2x1_L2 16,32,13,0 
+    KERNEL2x1_L2 16,32,14,0
+    KERNEL2x1_L2 16,32,15,1   
+    bdnz    CGEMM_L2x1_LOOP
+    MY_ALIGN  
+CGEMM_L2x1_LOOP_END:
+/* CTR exhausted: END2x1_2 presumably consumes the last preloaded operands; return to caller */
+    END2x1_2 
+    blr
+
+    MY_ALIGN
+CGEMM_2x1_L16_SUB:
+/* 2x1 remainder chunk for bit 16 of L: load, 7 KERNEL2x1_L2 steps, closing KERNEL2x1_E2, blr */
+    LOAD2x1_2
+    KERNEL2x1_L2 16,32,0,0
+    KERNEL2x1_L2 16,32,1,0  
+    KERNEL2x1_L2 16,32,2,0
+    KERNEL2x1_L2 16,32,3,0  
+    KERNEL2x1_L2 16,32,4,0
+    KERNEL2x1_L2 16,32,5,0 
+    KERNEL2x1_L2 16,32,6,0
+    KERNEL2x1_E2 16,32,7,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_2x1_L8_SUB:
+/* 2x1 remainder chunk for bit 8 of L: load, 3 KERNEL2x1_L2 steps, closing KERNEL2x1_E2, blr */
+    LOAD2x1_2
+    KERNEL2x1_L2 16,32,0,0
+    KERNEL2x1_L2 16,32,1,0  
+    KERNEL2x1_L2 16,32,2,0
+    KERNEL2x1_E2 16,32,3,1  
+    blr
+
+
+
+/*             MAIN LOOP BEGINS               */   
+    MY_ALIGN
+
+
+CGEMM_L2:
+/* Two-column pass: executed once, and only when bit 1 of N is set */
+
+    andi.    J,  N,  2
+    ble   CGEMM_L2_END
+
+
+CGEMM_L2_BEGIN:
+/* Set CO/AO for this pass, advance C by 2*LDC, and compute I = M >> 3 (number of 8-row tiles) */
+    mr    CO, C
+    slwi    T1, LDC , 1     
+    add     T2,C,LDC    
+    mr    AO, A  
+    add   C,  C,  T1
+#if defined(TRMMKERNEL) && defined(LEFT)   
+    mr TEMP_REG, OFFSET  /*off = offset;*/
+#endif     
+    srawi.    I,  M, 3
+    ble   CGEMM_L2x8_END
+    dcbt    CO,r0  /* prefetch hint for the first output column */
+    dcbt    T2,r0    
+
+CGEMM_L2x8_BEGIN:
+/* 2x8 tile driver, repeated I times: set BO, pick main-loop count T8, then handle the remainder */
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,8,2
+#else    
+    mr    BO, B  
+    dcbt    B,  r0  
+#endif     
+    dcbt    AO, r0
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
+    mr T1, T6
+/* T2..T5: byte offsets used as dcbt index registers inside the 2x8 subroutines */   
+    li T2, 1024
+    li T3, 1024+512
+    addi T1,T1, -2
+/* (prefetch offsets, continued) */     
+    li T4, 2048
+    li T5, 2048+512   
+    srawi.   T8, T1, 7 /* T8 = (T6-2) >> 7 (a quotient, not a modulus); sets CR0 */
+#else   
+    mr T1, K
+/* T2..T5: byte offsets used as dcbt index registers inside the 2x8 subroutines */   
+    li T2, 1024
+    li T3, 1024+512
+    addi T1,T1, -2
+/* (prefetch offsets, continued) */     
+    li T4, 2048
+    li T5, 2048+512 
+    srawi.   T8, T1, 7 /* T8 = (K-2) >> 7 (a quotient, not a modulus); sets CR0 */
+#endif   
+    ZERO2x8  
+    ble   CGEMM_L2x8_SUB0 /* T8 <= 0; assumes ZERO2x8 leaves CR0 from srawi. intact - TODO confirm */
+    bl CGEMM_L2x8_LMAIN_SUB
+    andi.   L,  T1, 127 /* L = leftover (count-2) & 127 after the main loop */
+    ble   CGEMM_L2x8_SAVE
+    b   CGEMM_L2x8_SUB2
+
+
+CGEMM_L2x8_SUB0:
+/* Short-count path. cmpwi overwrites the CR0 written by andi., so bne tests count == 129 */
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 255
+    cmpwi   T6,129
+#else   
+    andi.   L,  K,  255
+    cmpwi   K,129
+#endif       
+    li T8,1
+    bne CMP2x8_128K
+    addi BO,BO,-16
+    addi AO,AO,-64 
+    LOAD2x8O 64,16 
+    END2x8_WITHOUT_ADD   
+    LOAD2x8_2O  128, 32 
+    mtctr   T8    
+    bl CGEMM_L2x8_K128   
+    b CGEMM_L2x8_SAVE  
+    CMP2x8_128K:
+/* count == 128: preload with rewound AO/BO, then run one pass (CTR = 1) from the K128 entry */
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,128
+#else    
+    cmpwi   K,128
+#endif        
+    bne CGEMM_L2x8_SUB2 
+    MY_ALIGN   
+    mtctr   T8
+    addi BO,BO,-32
+    addi AO,AO,-128   
+    LOAD2x8_2O 128,32
+    bl CGEMM_L2x8_K128   
+    b CGEMM_L2x8_SAVE 
+    MY_ALIGN
+
+
+CGEMM_L2x8_SUB2:
+/* Remainder L is handled by testing its bits in turn: 64 here, then 32, 16, 8, 4, 2 and 1 below */
+    andi.      T1,L, 64
+    ble CGEMM_L2x8_SUB2_32
+    bl  CGEMM_2x8_L64_SUB
+    MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_32:
+/* bit 32 of L set: call the 32-chunk subroutine */
+    andi.      T1,L, 32
+    ble CGEMM_L2x8_SUB2_16    
+    bl  CGEMM_2x8_L32_SUB
+    MY_ALIGN 
+
+
+CGEMM_L2x8_SUB2_16:
+/* bit 16 of L set: call the 16-chunk subroutine */
+    andi.      T1,L, 16
+    ble CGEMM_L2x8_SUB2_8
+    bl  CGEMM_2x8_L16_SUB  
+    MY_ALIGN    
+
+
+CGEMM_L2x8_SUB2_8:
+/* bit 8 of L set: inline load + four kernel steps */
+    andi.      T1,L, 8
+    ble CGEMM_L2x8_SUB2_4
+    LOAD2x8_2
+    KERNEL2x8_L2  128,32, 0,0
+    KERNEL2x8_L2  128,32, 1,0
+    KERNEL2x8_L2  128,32, 2,0
+    KERNEL2x8_E2  128,32, 3,1
+    MY_ALIGN   
+
+
+CGEMM_L2x8_SUB2_4:
+/* bit 4 of L set: inline load + two kernel steps */
+    andi.      T1,L, 4
+    ble CGEMM_L2x8_SUB2_2
+    LOAD2x8_2
+    KERNEL2x8_L2  128,32, 0,0
+    KERNEL2x8_E2  128,32, 1,1
+    MY_ALIGN
+
+
+CGEMM_L2x8_SUB2_2:
+/* bit 2 of L set: inline load + one closing kernel step */
+    andi.      T1,L, 2
+    ble CGEMM_L2x8_SUB2_1
+    LOAD2x8_2 
+    KERNEL2x8_E2  128,32, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L2x8_SUB2_1:
+/* bit 1 of L set: single KERNEL2x8 step */
+    andi.      T1,L, 1
+    ble CGEMM_L2x8_SAVE 
+    KERNEL2x8
+
+    MY_ALIGN
+CGEMM_L2x8_SAVE:
+/* Store the tile and loop. NOTE(review): bgt below relies on SAVE2x8/REFRESH_AFTER_SAVE not clobbering CR0 */
+    addic.    I,  I,  -1 /* I -= 1; CR0 is consumed by the bgt after the save */
+    MY_ALIGN
+    SAVE2x8
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
+#endif     
+    bgt   CGEMM_L2x8_BEGIN
+    andi.   T2, M,  7 /* no rows left beyond the 8-row tiles: skip every smaller tile */
+    ble   CGEMM_L2x1_END
+    andi.   T1, M,  4
+    ble   CGEMM_L2x4_END
+    b   CGEMM_L2x4_BEGIN
+    MY_ALIGN 
+
+CGEMM_L2x8_END:
+/*----------------------------------------*/   
+
+
+CGEMM_L2x4_BEGIN:
+/* 2x4 tile: skip all small tiles when M & 7 == 0, skip this one when bit 2 of M is clear */
+    andi.   T2, M,  7
+    ble   CGEMM_L2x1_END
+    andi.   T1, M,  4
+    ble   CGEMM_L2x4_END
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,4,2
+#else    
+    mr    BO, B   
+#endif        
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
+    mr T1, T6 
+    addi T1,T1, -2 
+    srawi.   T8, T1, 5 /* T8 = (T6-2) >> 5 (a quotient, not a modulus); sets CR0 */
+#else   
+    mr T1, K 
+    addi T1,T1, -2
+    srawi.   T8, T1, 5 /* T8 = (K-2) >> 5 (a quotient, not a modulus); sets CR0 */
+#endif     
+    ZERO2x4
+    ble   CGEMM_L2x4_SUB0 /* T8 <= 0; assumes ZERO2x4 leaves CR0 from srawi. intact - TODO confirm */
+    bl CGEMM_2x4_LMAIN_SUB
+    andi.   L,  T1, 31 /* L = leftover (count-2) & 31 after the main loop */
+    ble   CGEMM_L2x4_SAVE
+    b    CGEMM_L2x4_SUB2
+
+
+CGEMM_L2x4_SUB0:
+/* Short-count path. cmpwi overwrites the CR0 written by andi., so bne tests count == 33 */
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 63
+    cmpwi   T6,33
+#else   
+    andi.   L,  K,  63
+    cmpwi   K,33
+#endif       
+    li T8,1
+    bne CMP2x4_32K
+    addi BO,BO,-16
+    addi AO,AO,-32  
+    LOAD2x4O 32,16 
+    END2x4_WITHOUT_ADD   
+    LOAD2x4_2O  64, 32 
+    mtctr   T8    
+    bl CGEMM_L2x4_K32   
+    b CGEMM_L2x4_SAVE  
+    CMP2x4_32K:
+/* count == 32: preload with rewound AO/BO, then run one pass (CTR = 1) from the K32 entry */
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,32
+#else    
+    cmpwi   K,32
+#endif        
+    bne CGEMM_L2x4_SUB2 
+    MY_ALIGN   
+    mtctr   T8
+    addi BO,BO,-32
+    addi AO,AO,-64   
+    LOAD2x4_2O 64,32
+    bl CGEMM_L2x4_K32   
+    b CGEMM_L2x4_SAVE 
+    MY_ALIGN 
+    MY_ALIGN 
+
+
+CGEMM_L2x4_SUB2:
+/* Remainder L is handled by testing its bits in turn: 16 here, then 8, 4, 2 and 1 below */
+    andi.      T1,L, 16
+    ble CGEMM_L2x4_SUB2_8
+    bl  CGEMM_2x4_L16_SUB  
+    MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_8:
+/* bit 8 of L set: call the 8-chunk subroutine */
+    andi.      T1,L, 8
+    ble CGEMM_L2x4_SUB2_4
+    bl CGEMM_2x4_L8_SUB
+    MY_ALIGN  
+
+
+CGEMM_L2x4_SUB2_4:
+/* bit 4 of L set: inline load + two kernel steps */
+    andi.      T1,L, 4
+    ble CGEMM_L2x4_SUB2_2
+    LOAD2x4_2
+    KERNEL2x4_L2  64,32, 0,0
+    KERNEL2x4_E2  64,32, 1,1
+    MY_ALIGN
+
+
+CGEMM_L2x4_SUB2_2:
+/* bit 2 of L set: inline load + one closing kernel step */
+    andi.      T1,L, 2
+    ble CGEMM_L2x4_SUB2_1
+    LOAD2x4_2
+    KERNEL2x4_E2  64,32, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L2x4_SUB2_1:
+/* bit 1 of L set: single KERNEL2x4 step */
+    andi.      T1,L, 1
+    ble CGEMM_L2x4_SAVE 
+    KERNEL2x4
+
+
+CGEMM_L2x4_SAVE:
+/* Store the tile via SAVE2x4; TRMM builds then refresh their offset bookkeeping */
+    SAVE2x4
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
+#endif     
+
+
+CGEMM_L2x4_END:
+/*----------------------------------------*/   
+
+
+CGEMM_L2x2_BEGIN:
+/* 2x2 tile: runs only when bit 1 of M is set */
+    andi.   T1, M,  2
+    ble   CGEMM_L2x2_END
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,2,2
+#else    
+    mr    BO, B   
+#endif        
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
+    mr T1, T6 
+    addi T1,T1, -2 
+    srawi.   T8, T1, 5 /* T8 = (T6-2) >> 5 (a quotient, not a modulus); sets CR0 */
+#else   
+    mr T1, K 
+    addi T1,T1, -2
+    srawi.   T8, T1, 5 /* T8 = (K-2) >> 5 (a quotient, not a modulus); sets CR0 */
+#endif     
+    ZERO2x2
+    ble   CGEMM_L2x2_SUB0 /* T8 <= 0; assumes ZERO2x2 leaves CR0 from srawi. intact - TODO confirm */
+    bl CGEMM_2x2_LMAIN_SUB
+    andi.   L,  T1, 31 /* L = leftover (count-2) & 31 after the main loop */
+    ble   CGEMM_L2x2_SAVE
+    b   CGEMM_L2x2_SUB2
+
+
+CGEMM_L2x2_SUB0:
+/* Short-count path. cmpwi overwrites the CR0 written by andi., so bne tests count == 33 */
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 63
+    cmpwi   T6,33
+#else   
+    andi.   L,  K,  63
+    cmpwi   K,33
+#endif       
+    li T8,1
+    bne CMP2x2_32K
+    addi BO,BO,-16
+    addi AO,AO,-16  
+    LOAD2x2O 16,16 
+    END2x2_WITHOUT_ADD   
+    LOAD2x2_2O  32, 32  
+    mtctr   T8    
+    bl CGEMM_L2x2_K32   
+    b CGEMM_L2x2_SAVE  
+    CMP2x2_32K:
+/* count == 32: preload with rewound AO/BO, then run one pass (CTR = 1) from the K32 entry */
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,32
+#else    
+    cmpwi   K,32
+#endif        
+    bne CGEMM_L2x2_SUB2 
+    MY_ALIGN   
+    mtctr   T8
+    addi BO,BO,-32
+    addi AO,AO,-32   
+    LOAD2x2_2O 32,32
+    bl CGEMM_L2x2_K32   
+    b CGEMM_L2x2_SAVE 
+    MY_ALIGN 
+    MY_ALIGN 
+
+
+CGEMM_L2x2_SUB2:
+/* Remainder L is handled by testing its bits in turn: 16 here, then 8, 4, 2 and 1 below */
+    andi.      T1,L, 16
+    ble CGEMM_L2x2_SUB2_8
+    bl CGEMM_2x2_L16_SUB  
+    MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_8:
+/* bit 8 of L set: call the 8-chunk subroutine */
+    andi.      T1,L, 8
+    ble CGEMM_L2x2_SUB2_4
+    bl CGEMM_2x2_L8_SUB
+    MY_ALIGN  
+
+
+CGEMM_L2x2_SUB2_4:
+/* bit 4 of L set: inline load + two kernel steps */
+    andi.      T1,L, 4
+    ble CGEMM_L2x2_SUB2_2
+    LOAD2x2_2
+    KERNEL2x2_L2  32,32, 0,0
+    KERNEL2x2_E2  32,32, 1,1
+    MY_ALIGN
+
+
+CGEMM_L2x2_SUB2_2:
+/* bit 2 of L set: inline load + one closing kernel step */
+    andi.      T1,L, 2
+    ble CGEMM_L2x2_SUB2_1
+    LOAD2x2_2
+    KERNEL2x2_E2  32,32, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L2x2_SUB2_1:
+/* bit 1 of L set: single KERNEL2x2 step */
+    andi.      T1,L, 1
+    ble CGEMM_L2x2_SAVE 
+    KERNEL2x2
+
+    MY_ALIGN
+CGEMM_L2x2_SAVE:
+/* Store the tile via SAVE2x2; TRMM builds then refresh their offset bookkeeping */
+    SAVE2x2
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
+#endif     
+
+
+CGEMM_L2x2_END:
+/*----------------------------------------*/   
+
+
+CGEMM_L2x1_BEGIN:
+/* 2x1 tile: runs only when bit 0 of M is set */
+    andi.   T1, M,  1
+    ble   CGEMM_L2x1_END
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,1,2
+#else    
+    mr    BO, B   
+#endif        
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
+    mr T1, T6 
+    addi T1,T1, -2 
+    srawi.   T8, T1, 5 /* T8 = (T6-2) >> 5 (a quotient, not a modulus); sets CR0 */
+#else   
+    mr T1, K 
+    addi T1,T1, -2
+    srawi.   T8, T1, 5 /* T8 = (K-2) >> 5 (a quotient, not a modulus); sets CR0 */
+#endif     
+    ZERO2x1
+    ble   CGEMM_L2x1_SUB0 /* T8 <= 0; assumes ZERO2x1 leaves CR0 from srawi. intact - TODO confirm */
+    bl CGEMM_2x1_LMAIN_SUB
+    andi.   L,  T1, 31 /* L = leftover (count-2) & 31 after the main loop */
+    ble   CGEMM_L2x1_SAVE
+    b   CGEMM_L2x1_SUB2
+
+
+CGEMM_L2x1_SUB0:
+/* Short-count path. cmpwi overwrites the CR0 written by andi., so bne tests count == 33 */
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 63
+    cmpwi   T6,33
+#else   
+    andi.   L,  K,  63
+    cmpwi   K,33
+#endif       
+    li T8,1
+    bne CMP2x1_32K
+    addi BO,BO,-16
+    addi AO,AO,-8  
+    LOAD2x1O 8,16 
+    END2x1_WITHOUT_ADD   
+    LOAD2x1_2O  16, 32  
+    mtctr   T8    
+    bl CGEMM_L2x1_K32   
+    b CGEMM_L2x1_SAVE  
+    CMP2x1_32K:
+/* count == 32: preload with rewound AO/BO, then run one pass (CTR = 1) from the K32 entry */
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,32
+#else    
+    cmpwi   K,32
+#endif        
+    bne CGEMM_L2x1_SUB2 
+    MY_ALIGN   
+    mtctr   T8
+    addi BO,BO,-32
+    addi AO,AO,-16   
+    LOAD2x1_2O 16,32
+    bl CGEMM_L2x1_K32   
+    b CGEMM_L2x1_SAVE 
+    MY_ALIGN 
+    MY_ALIGN 
+
+
+CGEMM_L2x1_SUB2:
+/* Remainder L is handled by testing its bits in turn: 16 here, then 8, 4, 2 and 1 below */
+    andi.      T1,L, 16
+    ble CGEMM_L2x1_SUB2_8
+    bl CGEMM_2x1_L16_SUB  
+    MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_8:
+/* bit 8 of L set: call the 8-chunk subroutine */
+    andi.      T1,L, 8
+    ble CGEMM_L2x1_SUB2_4
+    bl CGEMM_2x1_L8_SUB
+    MY_ALIGN  
+
+
+CGEMM_L2x1_SUB2_4:
+/* bit 4 of L set: inline load + two kernel steps */
+    andi.      T1,L, 4
+    ble CGEMM_L2x1_SUB2_2
+    LOAD2x1_2
+    KERNEL2x1_L2  16,32, 0,0
+    KERNEL2x1_E2  16,32, 1,1
+    MY_ALIGN
+
+
+CGEMM_L2x1_SUB2_2:
+/* bit 2 of L set: inline load + one closing kernel step */
+    andi.      T1,L, 2
+    ble CGEMM_L2x1_SUB2_1
+    LOAD2x1_2
+    KERNEL2x1_E2  16,32, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L2x1_SUB2_1:
+/* bit 1 of L set: single KERNEL2x1 step */
+    andi.      T1,L, 1
+    ble CGEMM_L2x1_SAVE 
+    KERNEL2x1
+
+    MY_ALIGN
+CGEMM_L2x1_SAVE:
+/* Store the tile via SAVE2x1; TRMM builds then refresh their offset bookkeeping */
+     
+    SAVE2x1
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
+#endif   
+
+
+CGEMM_L2x1_END:
+/* End of the two-column pass: B += K*16 bytes; no loop back, the pass runs at most once */
+    slwi    T1, K,  4
+
+    add   B,  B,  T1
+#if defined(TRMMKERNEL) && !defined(LEFT)   
+    addi TEMP_REG, TEMP_REG, 2
+#endif   
+
+CGEMM_L2_END:
+
+
+b CGEMM_L1
+/*                MINI SUBROUTINES                            */      
+/*                1x8 MAIN 128x+2 LOOP                     */      
+
+
+CGEMM_L1x8_LMAIN_SUB:
+/* 1x8 main loop subroutine: CTR = T8 passes over 64 unrolled KERNEL1x8_L2 steps */
+    mtctr   T8
+    LOAD1x8_2 
+    MY_ALIGN
+CGEMM_L1x8_LOOP:
+/* Loop head: prefetch at AO+PRE / BO+PRE, then step 0 */
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL1x8_L2 128,16,0,0 
+CGEMM_L1x8_K128:
+/* Alternate entry that skips step 0; presumably used by the CGEMM_L1x8_SUB0 special cases */
+    KERNEL1x8_L2 128,16,1,0
+    dcbt    AO, T2  
+    KERNEL1x8_L2 128,16,2,0
+    KERNEL1x8_L2 128,16,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL1x8_L2 128,16,4,0
+    KERNEL1x8_L2 128,16,5,0
+    dcbt    AO, T4  
+    KERNEL1x8_L2 128,16,6,0
+    KERNEL1x8_L2 128,16,7,0  
+    dcbt    AO, T5  
+    dcbt    BO, T3
+    KERNEL1x8_L2 128,16,8,0
+    KERNEL1x8_L2 128,16,9,0
+    KERNEL1x8_L2 128,16,10,0
+    KERNEL1x8_L2 128,16,11,0  
+    dcbt    BO, T4
+    KERNEL1x8_L2 128,16,12,0
+    KERNEL1x8_L2 128,16,13,0
+    KERNEL1x8_L2 128,16,14,0
+    KERNEL1x8_L2 128,16,15,0  
+    KERNEL1x8_L2 128,16,16,0
+    KERNEL1x8_L2 128,16,17,0 
+    KERNEL1x8_L2 128,16,18,0
+    KERNEL1x8_L2 128,16,19,0  
+    KERNEL1x8_L2 128,16,20,0
+    KERNEL1x8_L2 128,16,21,0 
+    KERNEL1x8_L2 128,16,22,0
+    KERNEL1x8_L2 128,16,23,0   
+    KERNEL1x8_L2 128,16,24,0
+    KERNEL1x8_L2 128,16,25,0
+    KERNEL1x8_L2 128,16,26,0
+    KERNEL1x8_L2 128,16,27,0  
+    KERNEL1x8_L2 128,16,28,0
+    KERNEL1x8_L2 128,16,29,0
+    KERNEL1x8_L2 128,16,30,0
+    KERNEL1x8_L2 128,16,31,0 
+    KERNEL1x8_L2 128,16,32,0
+    KERNEL1x8_L2 128,16,33,0
+    KERNEL1x8_L2 128,16,34,0
+    KERNEL1x8_L2 128,16,35,0 
+    KERNEL1x8_L2 128,16,36,0
+    KERNEL1x8_L2 128,16,37,0
+    KERNEL1x8_L2 128,16,38,0
+    KERNEL1x8_L2 128,16,39,0  
+    KERNEL1x8_L2 128,16,40,0
+    KERNEL1x8_L2 128,16,41,0
+    KERNEL1x8_L2 128,16,42,0
+    KERNEL1x8_L2 128,16,43,0  
+    KERNEL1x8_L2 128,16,44,0
+    KERNEL1x8_L2 128,16,45,0
+    KERNEL1x8_L2 128,16,46,0
+    KERNEL1x8_L2 128,16,47,0 
+    KERNEL1x8_L2 128,16,48,0
+    KERNEL1x8_L2 128,16,49,0 
+    KERNEL1x8_L2 128,16,50,0
+    KERNEL1x8_L2 128,16,51,0  
+    KERNEL1x8_L2 128,16,52,0
+    KERNEL1x8_L2 128,16,53,0 
+    KERNEL1x8_L2 128,16,54,0
+    KERNEL1x8_L2 128,16,55,0  
+    KERNEL1x8_L2 128,16,56,0
+    KERNEL1x8_L2 128,16,57,0
+    KERNEL1x8_L2 128,16,58,0
+    KERNEL1x8_L2 128,16,59,0  
+    KERNEL1x8_L2 128,16,60,0
+    KERNEL1x8_L2 128,16,61,0
+    KERNEL1x8_L2 128,16,62,0 
+    KERNEL1x8_L2 128,16,63,1  
+    bdnz    CGEMM_L1x8_LOOP
+    MY_ALIGN  
+CGEMM_L1x8_LOOP_END:
+/* CTR exhausted: END1x8_2 presumably consumes the last preloaded operands; return to caller */
+    END1x8_2
+    blr
+    MY_ALIGN
+
+
+CGEMM_1x8_L64_SUB:
+/* 1x8 remainder chunk (64): load, 31 KERNEL1x8_L2 steps, closing KERNEL1x8_E2, blr */
+    LOAD1x8_2  
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL1x8_L2 128,16,0,0 
+    KERNEL1x8_L2 128,16,1,0
+    dcbt    AO, T2  
+    KERNEL1x8_L2 128,16,2,0
+    KERNEL1x8_L2 128,16,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL1x8_L2 128,16,4,0
+    KERNEL1x8_L2 128,16,5,0
+    dcbt    AO, T4  
+    KERNEL1x8_L2 128,16,6,0
+    KERNEL1x8_L2 128,16,7,0  
+    dcbt    AO, T5  
+    dcbt    BO, T3
+    KERNEL1x8_L2 128,16,8,0
+    KERNEL1x8_L2 128,16,9,0
+    KERNEL1x8_L2 128,16,10,0
+    KERNEL1x8_L2 128,16,11,0  
+    dcbt    BO, T4
+    KERNEL1x8_L2 128,16,12,0
+    KERNEL1x8_L2 128,16,13,0
+    KERNEL1x8_L2 128,16,14,0
+    KERNEL1x8_L2 128,16,15,0  
+    KERNEL1x8_L2 128,16,16,0
+    KERNEL1x8_L2 128,16,17,0 
+    KERNEL1x8_L2 128,16,18,0
+    KERNEL1x8_L2 128,16,19,0  
+    KERNEL1x8_L2 128,16,20,0
+    KERNEL1x8_L2 128,16,21,0 
+    KERNEL1x8_L2 128,16,22,0
+    KERNEL1x8_L2 128,16,23,0   
+    KERNEL1x8_L2 128,16,24,0
+    KERNEL1x8_L2 128,16,25,0
+    KERNEL1x8_L2 128,16,26,0
+    KERNEL1x8_L2 128,16,27,0  
+    KERNEL1x8_L2 128,16,28,0
+    KERNEL1x8_L2 128,16,29,0
+    KERNEL1x8_L2 128,16,30,0
+    KERNEL1x8_E2 128,16,31,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_1x8_L32_SUB:
+/* 1x8 remainder chunk (32): load, 15 KERNEL1x8_L2 steps, closing KERNEL1x8_E2, blr */
+    LOAD1x8_2  
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL1x8_L2 128,16,0,0 
+    KERNEL1x8_L2 128,16,1,0
+    dcbt    AO, T2  
+    KERNEL1x8_L2 128,16,2,0
+    KERNEL1x8_L2 128,16,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL1x8_L2 128,16,4,0
+    KERNEL1x8_L2 128,16,5,0
+    dcbt    AO, T4  
+    KERNEL1x8_L2 128,16,6,0
+    KERNEL1x8_L2 128,16,7,0  
+    dcbt    AO, T5  
+    dcbt    BO, T3
+    KERNEL1x8_L2 128,16,8,0
+    KERNEL1x8_L2 128,16,9,0
+    KERNEL1x8_L2 128,16,10,0
+    KERNEL1x8_L2 128,16,11,0  
+    dcbt    BO, T4
+    KERNEL1x8_L2 128,16,12,0
+    KERNEL1x8_L2 128,16,13,0
+    KERNEL1x8_L2 128,16,14,0
+    KERNEL1x8_E2 128,16,15,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_1x8_L16_SUB:
+/* 1x8 remainder chunk (16): load, 7 KERNEL1x8_L2 steps, closing KERNEL1x8_E2, blr */
+    LOAD1x8_2 
+    dcbt    AO, PRE
+    dcbt    BO, PRE
+    KERNEL1x8_L2 128,16,0,0 
+    KERNEL1x8_L2 128,16,1,0
+    dcbt    AO, T2  
+    KERNEL1x8_L2 128,16,2,0
+    KERNEL1x8_L2 128,16,3,0 
+    dcbt    AO, T3
+    dcbt    BO, T2
+    KERNEL1x8_L2 128,16,4,0
+    KERNEL1x8_L2 128,16,5,0
+    dcbt    AO, T4  
+    KERNEL1x8_L2 128,16,6,0
+    KERNEL1x8_E2 128,16,7,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_1x4_LMAIN_SUB:
+/* 1x4 main loop subroutine: CTR = T8 passes over 16 unrolled KERNEL1x4_L2 steps */
+    mtctr   T8
+    LOAD1x4_2  
+    MY_ALIGN
+CGEMM_L1x4_LOOP:
+/* Loop head: step 0 */
+    KERNEL1x4_L2 64,16,0,0
+CGEMM_L1x4_K32:
+/* Alternate entry that skips step 0; presumably used by the CGEMM_L1x4_SUB0 special cases */
+    KERNEL1x4_L2 64,16,1,0   
+    KERNEL1x4_L2 64,16,2,0
+    KERNEL1x4_L2 64,16,3,0  
+    KERNEL1x4_L2 64,16,4,0
+    KERNEL1x4_L2 64,16,5,0 
+    KERNEL1x4_L2 64,16,6,0
+    KERNEL1x4_L2 64,16,7,0
+    KERNEL1x4_L2 64,16,8,0
+    KERNEL1x4_L2 64,16,9,0   
+    KERNEL1x4_L2 64,16,10,0
+    KERNEL1x4_L2 64,16,11,0  
+    KERNEL1x4_L2 64,16,12,0
+    KERNEL1x4_L2 64,16,13,0 
+    KERNEL1x4_L2 64,16,14,0
+    KERNEL1x4_L2 64,16,15,1    
+    bdnz    CGEMM_L1x4_LOOP
+    MY_ALIGN  
+CGEMM_L1x4_LOOP_END:
+/* CTR exhausted: END1x4_2 presumably consumes the last preloaded operands; return to caller */
+    END1x4_2 
+    blr
+    MY_ALIGN
+
+
+CGEMM_1x4_L16_SUB:
+/* 1x4 remainder chunk (16): load, 7 KERNEL1x4_L2 steps, closing KERNEL1x4_E2, blr */
+    LOAD1x4_2
+    KERNEL1x4_L2 64,16,0,0
+    KERNEL1x4_L2 64,16,1,0   
+    KERNEL1x4_L2 64,16,2,0
+    KERNEL1x4_L2 64,16,3,0  
+    KERNEL1x4_L2 64,16,4,0
+    KERNEL1x4_L2 64,16,5,0 
+    KERNEL1x4_L2 64,16,6,0
+    KERNEL1x4_E2 64,16,7,1
+    blr
+    MY_ALIGN
+
+
+CGEMM_1x4_L8_SUB:
+/* 1x4 remainder chunk (8): load, 3 KERNEL1x4_L2 steps, closing KERNEL1x4_E2, blr */
+    LOAD1x4_2
+    KERNEL1x4_L2 64,16,0,0
+    KERNEL1x4_L2 64,16,1,0   
+    KERNEL1x4_L2 64,16,2,0
+    KERNEL1x4_E2 64,16,3,1 
+    blr
+
+
+CGEMM_1x2_LMAIN_SUB:
+/* 1x2 main loop subroutine: CTR = T8 passes over 16 unrolled KERNEL1x2_L2 steps */
+    mtctr   T8
+    LOAD1x2_2  
+    MY_ALIGN 
+CGEMM_L1x2_LOOP:
+/* Loop head: step 0 */
+    KERNEL1x2_L2 32,16,0,0 
+CGEMM_L1x2_K32:
+/* Alternate entry that skips step 0; presumably used by the CGEMM_L1x2_SUB0 special cases */
+    KERNEL1x2_L2 32,16,1,0  
+    KERNEL1x2_L2 32,16,2,0
+    KERNEL1x2_L2 32,16,3,0  
+    KERNEL1x2_L2 32,16,4,0
+    KERNEL1x2_L2 32,16,5,0 
+    KERNEL1x2_L2 32,16,6,0
+    KERNEL1x2_L2 32,16,7,0
+    KERNEL1x2_L2 32,16,8,0
+    KERNEL1x2_L2 32,16,9,0  
+    KERNEL1x2_L2 32,16,10,0
+    KERNEL1x2_L2 32,16,11,0  
+    KERNEL1x2_L2 32,16,12,0
+    KERNEL1x2_L2 32,16,13,0 
+    KERNEL1x2_L2 32,16,14,0
+    KERNEL1x2_L2 32,16,15,1   
+    bdnz    CGEMM_L1x2_LOOP
+    MY_ALIGN  
+
+
+CGEMM_L1x2_LOOP_END:
+/* CTR exhausted: END1x2_2 presumably consumes the last preloaded operands; return to caller */
+    END1x2_2 
+    blr
+    MY_ALIGN
+CGEMM_1x2_L16_SUB:
+/* 1x2 remainder chunk (16): load, 7 KERNEL1x2_L2 steps, closing KERNEL1x2_E2, blr */
+    LOAD1x2_2
+    KERNEL1x2_L2 32,16,0,0
+    KERNEL1x2_L2 32,16,1,0  
+    KERNEL1x2_L2 32,16,2,0
+    KERNEL1x2_L2 32,16,3,0  
+    KERNEL1x2_L2 32,16,4,0
+    KERNEL1x2_L2 32,16,5,0 
+    KERNEL1x2_L2 32,16,6,0
+    KERNEL1x2_E2 32,16,7,1
+    blr
+    MY_ALIGN
+CGEMM_1x2_L8_SUB:
+/* 1x2 remainder chunk (8): load, 3 KERNEL1x2_L2 steps, closing KERNEL1x2_E2, blr */
+    LOAD1x2_2
+    KERNEL1x2_L2 32,16,0,0
+    KERNEL1x2_L2 32,16,1,0  
+    KERNEL1x2_L2 32,16,2,0
+    KERNEL1x2_E2 32,16,3,1  
+    blr
+
+
+CGEMM_1x1_LMAIN_SUB:
+/* 1x1 main loop subroutine: CTR = T8 passes over 16 unrolled KERNEL1x1_L2 steps */
+    mtctr   T8
+    LOAD1x1_2  
+    MY_ALIGN
+CGEMM_L1x1_LOOP:
+/* Loop head: step 0 */
+    KERNEL1x1_L2 16,16,0,0 
+CGEMM_L1x1_K32:
+/* Alternate entry that skips step 0; presumably used by the CGEMM_L1x1_SUB0 special cases */
+    KERNEL1x1_L2 16,16,1,0  
+    KERNEL1x1_L2 16,16,2,0
+    KERNEL1x1_L2 16,16,3,0  
+    KERNEL1x1_L2 16,16,4,0
+    KERNEL1x1_L2 16,16,5,0 
+    KERNEL1x1_L2 16,16,6,0
+    KERNEL1x1_L2 16,16,7,0
+    KERNEL1x1_L2 16,16,8,0
+    KERNEL1x1_L2 16,16,9,0  
+    KERNEL1x1_L2 16,16,10,0
+    KERNEL1x1_L2 16,16,11,0  
+    KERNEL1x1_L2 16,16,12,0
+    KERNEL1x1_L2 16,16,13,0 
+    KERNEL1x1_L2 16,16,14,0
+    KERNEL1x1_L2 16,16,15,1   
+    bdnz    CGEMM_L1x1_LOOP
+    MY_ALIGN  
+CGEMM_L1x1_LOOP_END:
+/* CTR exhausted: END1x1_2 presumably consumes the last preloaded operands; return to caller */
+    END1x1_2 
+    blr
+
+    MY_ALIGN
+CGEMM_1x1_L16_SUB:
+/* 1x1 remainder chunk (16): load, 7 KERNEL1x1_L2 steps, closing KERNEL1x1_E2, blr */
+    LOAD1x1_2
+    KERNEL1x1_L2 16,16,0,0
+    KERNEL1x1_L2 16,16,1,0  
+    KERNEL1x1_L2 16,16,2,0
+    KERNEL1x1_L2 16,16,3,0  
+    KERNEL1x1_L2 16,16,4,0
+    KERNEL1x1_L2 16,16,5,0 
+    KERNEL1x1_L2 16,16,6,0
+    KERNEL1x1_E2 16,16,7,1
+    blr
+    MY_ALIGN
+
+
+/* Remainder helper for the 1x1 tile; entered with `bl`, returns via `blr`.
+ * Issues LOAD1x1_2 and four two-step kernel macros (indices 0..3), the last
+ * being the _E2 form with IsLast=1 (presumably 8 k-iterations in total, as the
+ * label name suggests -- TODO confirm against the 1x1 macro definitions). */
+CGEMM_1x1_L8_SUB:
+/*----------------------------------------*/   
+    LOAD1x1_2
+    KERNEL1x1_L2 16,16,0,0
+    KERNEL1x1_L2 16,16,1,0  
+    KERNEL1x1_L2 16,16,2,0
+    KERNEL1x1_E2 16,16,3,1  
+    blr
+
+
+
+/*             MAIN LOOP BEGINS               */   
+    MY_ALIGN
+
+
+/* Entry of the N&1 section: handles one remaining column of B/C when N is odd.
+ * Tiles of 8, 4, 2 and 1 rows of M are processed in turn by the code below. */
+CGEMM_L1:
+/*----------------------------------------*/   
+
+/* J = N & 1.  andi. never yields a negative value, so the ble is taken only
+ * when the result is zero, i.e. N is even and there is nothing to do here. */
+    andi.    J,  N,  1
+    ble   CGEMM_L1_END
+
+CGEMM_L1_BEGIN:
+/*----------------------------------------*/   
+    mr    CO, C  
+    add     T2,C,LDC    
+    mr    AO, A  
+/* NOTE(review): T1 is not set inside this section; its value comes from code
+ * before CGEMM_L1 -- confirm it holds the intended C increment. */
+    add   C,  C,  T1
+#if defined(TRMMKERNEL) && defined(LEFT)   
+    mr TEMP_REG, OFFSET  /*off = offset;*/
+#endif     
+/* I = M >> 3 (number of 8-row tiles); skip the 1x8 loop when there are none. */
+    srawi.    I,  M,  3
+    ble   CGEMM_L1x8_END
+    dcbt    CO,r0  /*just prefetch*/
+    dcbt    T2,r0    
+
+
+/* Start of one 1x8 tile: set up AO/BO, compute the main-loop trip count,
+ * clear the accumulators and dispatch to the main loop or the short-K path.
+ * For TRMM builds the effective K is produced in T6 by REFRESH_TEMP_BK
+ * (macro defined elsewhere); otherwise K is used directly. */
+CGEMM_L1x8_BEGIN:
+/*----------------------------------------*/   
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,8,1
+#else    
+    mr    BO, B  
+    dcbt    B,  r0  
+#endif     
+    dcbt    AO, r0
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
+    mr T1, T6
+/* TEMPS FOR PREFETCH */   
+    li T2, 1024
+    li T3, 1024+512
+    addi T1,T1, -2
+/* TEMPS FOR PREFETCH */     
+    li T4, 2048
+    li T5, 2048+512   
+    srawi.   T8, T1, 7 /* T8 = (T6-2) >> 7, i.e. (T6-2)/128; sets CR0 */
+#else   
+    mr T1, K
+/* TEMPS FOR PREFETCH */   
+    li T2, 1024
+    li T3, 1024+512
+    addi T1,T1, -2
+/* TEMPS FOR PREFETCH */     
+    li T4, 2048
+    li T5, 2048+512 
+    srawi.   T8, T1, 7 /* T8 = (K-2) >> 7, i.e. (K-2)/128; sets CR0 */
+#endif   
+/* NOTE(review): the ble below consumes CR0 from the srawi. above, so ZERO1x8
+ * is assumed not to modify CR0 -- confirm in its definition. */
+    ZERO1x8  
+    ble   CGEMM_L1x8_SUB0
+    bl CGEMM_L1x8_LMAIN_SUB
+/* L = (k-2) & 127: what is left after the main loop; zero means go save. */
+    andi.   L,  T1, 127
+    ble   CGEMM_L1x8_SAVE
+    b   CGEMM_L1x8_SUB2
+
+
+/* Short-K path for the 1x8 tile, reached when (k-2) >> 7 <= 0.
+ * k == 129 and k == 128 are special-cased by jumping into the main loop body
+ * at CGEMM_L1x8_K128 with CTR = 1; every other k falls to CGEMM_L1x8_SUB2
+ * with L = k & 255.
+ * In both special cases AO/BO are first moved backwards and the *O load macros
+ * are given compensating offsets; presumably this makes the displacements used
+ * after the K128 entry point line up -- TODO confirm against the 1x8 macros. */
+CGEMM_L1x8_SUB0:
+/*----------------------------------------*/   
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 255
+    cmpwi   T6,129
+#else   
+    andi.   L,  K,  255
+    cmpwi   K,129
+#endif       
+    li T8,1
+    bne CMP1x8_128K
+/* k == 129: one single step (LOAD1x8O + END1x8_WITHOUT_ADD), then the loop body once. */
+    addi BO,BO,-8
+    addi AO,AO,-64 
+    LOAD1x8O 64,8 
+    END1x8_WITHOUT_ADD   
+    LOAD1x8_2O  128, 16 
+    mtctr   T8    
+    bl CGEMM_L1x8_K128   
+    b CGEMM_L1x8_SAVE  
+    CMP1x8_128K:
+/*----------------------------------------*/   
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,128
+#else    
+    cmpwi   K,128
+#endif        
+    bne CGEMM_L1x8_SUB2 
+    MY_ALIGN   
+/* k == 128: load ahead, then run the loop body once via the K128 entry. */
+    mtctr   T8
+    addi BO,BO,-16
+    addi AO,AO,-128   
+    LOAD1x8_2O 128,16
+    bl CGEMM_L1x8_K128   
+    b CGEMM_L1x8_SAVE 
+    MY_ALIGN
+
+
+/* Remainder handling for the 1x8 tile.  L holds the leftover count set by the
+ * caller path; its bits 64, 32, 16, 8, 4, 2, 1 are tested one after another
+ * (andi. into T1, ble skips when the bit is clear) and each set bit runs a
+ * correspondingly sized chunk: subroutines for 64/32/16, inline two-step
+ * kernel sequences for 8/4/2, and a single KERNEL1x8 for the last bit.
+ * Control falls through from each stage to the next and finally into
+ * CGEMM_L1x8_SAVE. */
+CGEMM_L1x8_SUB2:
+/*----------------------------------------*/   
+    andi.      T1,L, 64
+    ble CGEMM_L1x8_SUB2_32
+    bl  CGEMM_1x8_L64_SUB
+    MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_32:
+/*----------------------------------------*/   
+    andi.      T1,L, 32
+    ble CGEMM_L1x8_SUB2_16    
+    bl  CGEMM_1x8_L32_SUB
+    MY_ALIGN 
+
+
+CGEMM_L1x8_SUB2_16:
+/*----------------------------------------*/   
+    andi.      T1,L, 16
+    ble CGEMM_L1x8_SUB2_8
+    bl  CGEMM_1x8_L16_SUB  
+    MY_ALIGN    
+
+
+CGEMM_L1x8_SUB2_8:
+/*----------------------------------------*/   
+    andi.      T1,L, 8
+    ble CGEMM_L1x8_SUB2_4
+    LOAD1x8_2
+    KERNEL1x8_L2  128,16, 0,0
+    KERNEL1x8_L2  128,16, 1,0
+    KERNEL1x8_L2  128,16, 2,0
+    KERNEL1x8_E2  128,16, 3,1
+    MY_ALIGN   
+
+
+CGEMM_L1x8_SUB2_4:
+/*----------------------------------------*/   
+    andi.      T1,L, 4
+    ble CGEMM_L1x8_SUB2_2
+    LOAD1x8_2
+    KERNEL1x8_L2  128,16, 0,0
+    KERNEL1x8_E2  128,16, 1,1
+    MY_ALIGN
+
+
+CGEMM_L1x8_SUB2_2:
+/*----------------------------------------*/   
+    andi.      T1,L, 2
+    ble CGEMM_L1x8_SUB2_1
+    LOAD1x8_2 
+    KERNEL1x8_E2  128,16, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L1x8_SUB2_1:
+/*----------------------------------------*/   
+    andi.      T1,L, 1
+    ble CGEMM_L1x8_SAVE 
+    KERNEL1x8
+
+    MY_ALIGN
+/* Write back the finished 1x8 tile and loop over the remaining 8-row tiles.
+ * I is decremented first (addic. sets CR0); the bgt after the save macros
+ * uses that result.
+ * NOTE(review): this assumes SAVE1x8 and REFRESH_AFTER_SAVE leave CR0
+ * untouched -- confirm in their definitions.
+ * After the last 8-row tile: M & 7 == 0 skips straight to CGEMM_L1x1_END,
+ * M & 4 == 0 skips the 4-row tile.  CGEMM_L1x4_BEGIN repeats both tests. */
+CGEMM_L1x8_SAVE:
+/*----------------------------------------*/   
+    addic.    I,  I,  -1
+    MY_ALIGN
+    SAVE1x8
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
+#endif     
+    bgt   CGEMM_L1x8_BEGIN
+    andi.   T2, M,  7
+    ble   CGEMM_L1x1_END
+    andi.   T1, M,  4
+    ble   CGEMM_L1x4_END
+    b   CGEMM_L1x4_BEGIN
+    MY_ALIGN 
+
+
+CGEMM_L1x8_END:
+/*----------------------------------------*/   
+
+
+/* Start of the 1x4 tile (run at most once, when bit 2 of M is set).
+ * M & 7 == 0 means no rows are left at all; M & 4 == 0 skips this tile.
+ * Then sets up BO (and AO via REFRESH_POINTERS for TRMM), computes the
+ * main-loop trip count, clears the accumulators and dispatches. */
+CGEMM_L1x4_BEGIN:
+/*----------------------------------------*/   
+    andi.   T2, M,  7
+    ble   CGEMM_L1x1_END
+    andi.   T1, M,  4
+    ble   CGEMM_L1x4_END
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,4,1
+#else    
+    mr    BO, B   
+#endif        
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
+    mr T1, T6 
+    addi T1,T1, -2 
+    srawi.   T8, T1, 5 /* T8 = (T6-2) >> 5, i.e. (T6-2)/32; sets CR0 */
+#else   
+    mr T1, K 
+    addi T1,T1, -2
+    srawi.   T8, T1, 5 /* T8 = (K-2) >> 5, i.e. (K-2)/32; sets CR0 */
+#endif     
+/* NOTE(review): the ble below consumes CR0 from the srawi. above, so ZERO1x4
+ * is assumed not to modify CR0 -- confirm in its definition. */
+    ZERO1x4
+    ble   CGEMM_L1x4_SUB0 
+    bl CGEMM_1x4_LMAIN_SUB
+/* L = (k-2) & 31: what is left after the main loop; zero means go save. */
+    andi.   L,  T1, 31
+    ble   CGEMM_L1x4_SAVE
+    b    CGEMM_L1x4_SUB2
+
+
+/* Short-K path for the 1x4 tile, reached when (k-2) >> 5 <= 0.
+ * k == 33 and k == 32 are special-cased by jumping into the main loop body at
+ * CGEMM_L1x4_K32 with CTR = 1; every other k falls to CGEMM_L1x4_SUB2 with
+ * L = k & 63.
+ * AO/BO are moved backwards and the *O load macros get compensating offsets;
+ * presumably so the displacements used after the K32 entry point line up --
+ * TODO confirm against the 1x4 macro definitions. */
+CGEMM_L1x4_SUB0:
+/*----------------------------------------*/   
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 63
+    cmpwi   T6,33
+#else   
+    andi.   L,  K,  63
+    cmpwi   K,33
+#endif       
+    li T8,1
+    bne CMP1x4_32K
+/* k == 33: one single step (LOAD1x4O + END1x4_WITHOUT_ADD), then the loop body once. */
+    addi BO,BO,-8
+    addi AO,AO,-32  
+    LOAD1x4O 32,8 
+    END1x4_WITHOUT_ADD   
+    LOAD1x4_2O  64, 16 
+    mtctr   T8    
+    bl CGEMM_L1x4_K32   
+    b CGEMM_L1x4_SAVE  
+    CMP1x4_32K:
+/*----------------------------------------*/   
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,32
+#else    
+    cmpwi   K,32
+#endif        
+    bne CGEMM_L1x4_SUB2 
+    MY_ALIGN   
+/* k == 32: load ahead, then run the loop body once via the K32 entry. */
+    mtctr   T8
+    addi BO,BO,-16
+    addi AO,AO,-64   
+    LOAD1x4_2O 64,16
+    bl CGEMM_L1x4_K32   
+    b CGEMM_L1x4_SAVE 
+    MY_ALIGN 
+    MY_ALIGN 
+
+
+/* Remainder handling for the 1x4 tile.  Bits 16, 8, 4, 2, 1 of L are tested
+ * in turn (andi. into T1, ble skips when the bit is clear): subroutines for
+ * 16 and 8, inline two-step kernel sequences for 4 and 2, and a single
+ * KERNEL1x4 for the last bit.  Each stage falls through to the next and
+ * finally into CGEMM_L1x4_SAVE. */
+CGEMM_L1x4_SUB2:
+/*----------------------------------------*/   
+    andi.      T1,L, 16
+    ble CGEMM_L1x4_SUB2_8
+    bl  CGEMM_1x4_L16_SUB  
+    MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_8:
+/*----------------------------------------*/   
+    andi.      T1,L, 8
+    ble CGEMM_L1x4_SUB2_4
+    bl CGEMM_1x4_L8_SUB
+    MY_ALIGN  
+
+
+CGEMM_L1x4_SUB2_4:
+/*----------------------------------------*/   
+    andi.      T1,L, 4
+    ble CGEMM_L1x4_SUB2_2
+    LOAD1x4_2
+    KERNEL1x4_L2  64,16, 0,0
+    KERNEL1x4_E2  64,16, 1,1
+    MY_ALIGN
+
+
+CGEMM_L1x4_SUB2_2:
+/*----------------------------------------*/   
+    andi.      T1,L, 2
+    ble CGEMM_L1x4_SUB2_1
+    LOAD1x4_2
+    KERNEL1x4_E2  64,16, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L1x4_SUB2_1:
+/*----------------------------------------*/   
+    andi.      T1,L, 1
+    ble CGEMM_L1x4_SAVE 
+    KERNEL1x4
+
+
+/* Write back the 1x4 tile (SAVE1x4, defined elsewhere).  There is no loop
+ * here: execution falls through to the 1x2 tile.  TRMM builds additionally
+ * run REFRESH_AFTER_SAVE on T6/TEMP_REG/BO/AO. */
+CGEMM_L1x4_SAVE:
+/*----------------------------------------*/   
+    SAVE1x4
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
+#endif     
+
+
+CGEMM_L1x4_END:
+/*----------------------------------------*/   
+
+
+/* Start of the 1x2 tile (run at most once, when bit 1 of M is set).
+ * Sets up BO (and AO via REFRESH_POINTERS for TRMM), computes the main-loop
+ * trip count, clears the accumulators and dispatches. */
+CGEMM_L1x2_BEGIN:
+/*----------------------------------------*/   
+    andi.   T1, M,  2
+    ble   CGEMM_L1x2_END
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,2,1
+#else    
+    mr    BO, B   
+#endif        
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,2,1
+    mr T1, T6 
+    addi T1,T1, -2 
+    srawi.   T8, T1, 5 /* T8 = (T6-2) >> 5, i.e. (T6-2)/32; sets CR0 */
+#else   
+    mr T1, K 
+    addi T1,T1, -2
+    srawi.   T8, T1, 5 /* T8 = (K-2) >> 5, i.e. (K-2)/32; sets CR0 */
+#endif     
+/* NOTE(review): the ble below consumes CR0 from the srawi. above, so ZERO1x2
+ * is assumed not to modify CR0 -- confirm in its definition. */
+    ZERO1x2
+    ble   CGEMM_L1x2_SUB0 
+    bl CGEMM_1x2_LMAIN_SUB
+/* L = (k-2) & 31: what is left after the main loop; zero means go save. */
+    andi.   L,  T1, 31
+    ble   CGEMM_L1x2_SAVE
+    b   CGEMM_L1x2_SUB2
+
+
+/* Short-K path for the 1x2 tile, reached when (k-2) >> 5 <= 0.
+ * k == 33 and k == 32 are special-cased by jumping into the main loop body at
+ * CGEMM_L1x2_K32 with CTR = 1; every other k falls to CGEMM_L1x2_SUB2 with
+ * L = k & 63.
+ * AO/BO are moved backwards and the *O load macros get compensating offsets;
+ * presumably so the displacements used after the K32 entry point line up --
+ * TODO confirm against the 1x2 macro definitions. */
+CGEMM_L1x2_SUB0:
+/*----------------------------------------*/   
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 63
+    cmpwi   T6,33
+#else   
+    andi.   L,  K,  63
+    cmpwi   K,33
+#endif       
+    li T8,1
+    bne CMP1x2_32K
+/* k == 33: one single step (LOAD1x2O + END1x2_WITHOUT_ADD), then the loop body once. */
+    addi BO,BO,-8
+    addi AO,AO,-16  
+    LOAD1x2O 16,8 
+    END1x2_WITHOUT_ADD   
+    LOAD1x2_2O  32, 16  
+    mtctr   T8    
+    bl CGEMM_L1x2_K32   
+    b CGEMM_L1x2_SAVE  
+    CMP1x2_32K:
+/*----------------------------------------*/   
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,32
+#else    
+    cmpwi   K,32
+#endif        
+    bne CGEMM_L1x2_SUB2 
+    MY_ALIGN   
+/* k == 32: load ahead, then run the loop body once via the K32 entry. */
+    mtctr   T8
+    addi BO,BO,-16
+    addi AO,AO,-32   
+    LOAD1x2_2O 32,16
+    bl CGEMM_L1x2_K32   
+    b CGEMM_L1x2_SAVE 
+    MY_ALIGN 
+    MY_ALIGN 
+
+
+/* Remainder handling for the 1x2 tile.  Bits 16, 8, 4, 2, 1 of L are tested
+ * in turn (andi. into T1, ble skips when the bit is clear): subroutines for
+ * 16 and 8, inline two-step kernel sequences for 4 and 2, and a single
+ * KERNEL1x2 for the last bit.  Each stage falls through to the next and
+ * finally into CGEMM_L1x2_SAVE. */
+CGEMM_L1x2_SUB2:
+/*----------------------------------------*/   
+    andi.      T1,L, 16
+    ble CGEMM_L1x2_SUB2_8
+    bl CGEMM_1x2_L16_SUB  
+    MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_8:
+/*----------------------------------------*/   
+    andi.      T1,L, 8
+    ble CGEMM_L1x2_SUB2_4
+    bl CGEMM_1x2_L8_SUB
+    MY_ALIGN  
+
+
+CGEMM_L1x2_SUB2_4:
+/*----------------------------------------*/   
+    andi.      T1,L, 4
+    ble CGEMM_L1x2_SUB2_2
+    LOAD1x2_2
+    KERNEL1x2_L2  32,16, 0,0
+    KERNEL1x2_E2  32,16, 1,1
+    MY_ALIGN
+
+
+CGEMM_L1x2_SUB2_2:
+/*----------------------------------------*/   
+    andi.      T1,L, 2
+    ble CGEMM_L1x2_SUB2_1
+    LOAD1x2_2
+    KERNEL1x2_E2  32,16, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L1x2_SUB2_1:
+/*----------------------------------------*/   
+    andi.      T1,L, 1
+    ble CGEMM_L1x2_SAVE 
+    KERNEL1x2
+
+    MY_ALIGN
+/* Write back the 1x2 tile (SAVE1x2, defined elsewhere).  There is no loop
+ * here: execution falls through to the 1x1 tile.  TRMM builds additionally
+ * run REFRESH_AFTER_SAVE on T6/TEMP_REG/BO/AO. */
+CGEMM_L1x2_SAVE:
+/*----------------------------------------*/   
+    SAVE1x2
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1
+#endif     
+
+
+CGEMM_L1x2_END:
+/*----------------------------------------*/   
+
+
+/* Start of the 1x1 tile (run at most once, when bit 0 of M is set).
+ * Sets up BO (and AO via REFRESH_POINTERS for TRMM), computes the main-loop
+ * trip count, clears the accumulators and dispatches. */
+CGEMM_L1x1_BEGIN:
+/*----------------------------------------*/   
+    andi.   T1, M,  1
+    ble   CGEMM_L1x1_END
+#if defined(TRMMKERNEL)   
+    REFRESH_POINTERS  AO,BO,TEMP_REG,B,1,1
+#else    
+    mr    BO, B   
+#endif        
+#if defined(TRMMKERNEL)   
+    REFRESH_TEMP_BK T6,K,TEMP_REG,1,1
+    mr T1, T6 
+    addi T1,T1, -2 
+    srawi.   T8, T1, 5 /* T8 = (T6-2) >> 5, i.e. (T6-2)/32; sets CR0 */
+#else   
+    mr T1, K 
+    addi T1,T1, -2
+    srawi.   T8, T1, 5 /* T8 = (K-2) >> 5, i.e. (K-2)/32; sets CR0 */
+#endif     
+/* NOTE(review): the ble below consumes CR0 from the srawi. above, so ZERO1x1
+ * is assumed not to modify CR0 -- confirm in its definition. */
+    ZERO1x1
+    ble   CGEMM_L1x1_SUB0 
+    bl CGEMM_1x1_LMAIN_SUB
+/* L = (k-2) & 31: what is left after the main loop; zero means go save. */
+    andi.   L,  T1, 31
+    ble   CGEMM_L1x1_SAVE
+    b   CGEMM_L1x1_SUB2
+
+
+/* Short-K path for the 1x1 tile, reached when (k-2) >> 5 <= 0.
+ * k == 33 and k == 32 are special-cased by jumping into the main loop body at
+ * CGEMM_L1x1_K32 (the entry after the index-0 kernel step) with CTR = 1;
+ * every other k falls to CGEMM_L1x1_SUB2 with L = k & 63.
+ * AO/BO are moved backwards and the *O load macros get compensating offsets;
+ * presumably so the displacements used after the K32 entry point line up --
+ * TODO confirm against the 1x1 macro definitions. */
+CGEMM_L1x1_SUB0:
+/*----------------------------------------*/   
+#if defined(TRMMKERNEL)   
+    andi.   L,  T6, 63
+    cmpwi   T6,33
+#else   
+    andi.   L,  K,  63
+    cmpwi   K,33
+#endif       
+    li T8,1
+    bne CMP1x1_32K
+/* k == 33: one single step (LOAD1x1O + END1x1_WITHOUT_ADD), then the loop body once. */
+    addi BO,BO,-8
+    addi AO,AO,-8  
+    LOAD1x1O 8,8 
+    END1x1_WITHOUT_ADD   
+    LOAD1x1_2O  16, 16  
+    mtctr   T8    
+    bl CGEMM_L1x1_K32   
+    b CGEMM_L1x1_SAVE  
+    CMP1x1_32K:
+/*----------------------------------------*/   
+#if defined(TRMMKERNEL)    
+    cmpwi   T6,32
+#else    
+    cmpwi   K,32
+#endif        
+    bne CGEMM_L1x1_SUB2 
+    MY_ALIGN   
+/* k == 32: load ahead, then run the loop body once via the K32 entry. */
+    mtctr   T8
+    addi BO,BO,-16
+    addi AO,AO,-16   
+    LOAD1x1_2O 16,16
+    bl CGEMM_L1x1_K32   
+    b CGEMM_L1x1_SAVE 
+    MY_ALIGN 
+    MY_ALIGN 
+
+
+/* Remainder handling for the 1x1 tile.  Bits 16, 8, 4, 2, 1 of L are tested
+ * in turn (andi. into T1, ble skips when the bit is clear): subroutines for
+ * 16 and 8, inline two-step kernel sequences for 4 and 2, and a single
+ * KERNEL1x1 for the last bit.  Each stage falls through to the next and
+ * finally into CGEMM_L1x1_SAVE. */
+CGEMM_L1x1_SUB2:
+/*----------------------------------------*/   
+    andi.      T1,L, 16
+    ble CGEMM_L1x1_SUB2_8
+    bl CGEMM_1x1_L16_SUB  
+    MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_8:
+/*----------------------------------------*/   
+    andi.      T1,L, 8
+    ble CGEMM_L1x1_SUB2_4
+    bl CGEMM_1x1_L8_SUB
+    MY_ALIGN  
+
+
+CGEMM_L1x1_SUB2_4:
+/*----------------------------------------*/   
+    andi.      T1,L, 4
+    ble CGEMM_L1x1_SUB2_2
+    LOAD1x1_2
+    KERNEL1x1_L2  16,16, 0,0
+    KERNEL1x1_E2  16,16, 1,1
+    MY_ALIGN
+
+
+CGEMM_L1x1_SUB2_2:
+/*----------------------------------------*/   
+    andi.      T1,L, 2
+    ble CGEMM_L1x1_SUB2_1
+    LOAD1x1_2
+    KERNEL1x1_E2  16,16, 0,1
+    MY_ALIGN    
+
+
+CGEMM_L1x1_SUB2_1:
+/*----------------------------------------*/   
+    andi.      T1,L, 1
+    ble CGEMM_L1x1_SAVE 
+    KERNEL1x1
+
+    MY_ALIGN
+/* Write back the 1x1 tile (SAVE1x1, defined elsewhere); execution then falls
+ * through to CGEMM_L1x1_END.  TRMM builds additionally run REFRESH_AFTER_SAVE
+ * on T6/TEMP_REG/BO/AO. */
+CGEMM_L1x1_SAVE:
+/*----------------------------------------*/  
+     
+    SAVE1x1
+#if defined(TRMMKERNEL)    
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
+#endif   
+
+
+/* End of the single-column section: advance B past the column just used.
+ * T1 = K << 3 = K*8 bytes (presumably K single-precision complex elements of
+ * 8 bytes each -- TODO confirm).  For TRMM builds without LEFT, TEMP_REG is
+ * incremented by 1 as well. */
+CGEMM_L1x1_END:
+/*----------------------------------------*/   
+    slwi    T1, K,  3
+
+    add   B,  B,  T1
+#if defined(TRMMKERNEL) && !defined(LEFT)   
+    addi TEMP_REG, TEMP_REG, 1
+#endif   
+
+CGEMM_L1_END:
+
+
+
+
diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S

new file mode 100644 (file)

index 0000000..b66e934
--- /dev/null
+++ b/kernel/power/cgemm_macros_power10.S
@@ -0,0 +1,2131 @@
+/***************************************************************************
+Copyright (c) 2013-2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* Byte-displacement helpers for load and addi operands.
+ * unit_size is the stride unit in bytes (8; presumably one single-precision
+ * complex element, i.e. two 4-byte floats -- TODO confirm).
+ * DISPn(ind, disp) expands to ind*unit_size*n + disp; DISPX(disp) is disp.
+ * NOTE(review): ind and disp are not parenthesised inside DISP1..DISP32, so
+ * ind should be a single term (the kernel macros pass a plain index). */
+#define unit_size 8
+#define DISP32(ind, disp) (ind*unit_size*32+disp)
+#define DISP16(ind, disp) (ind*unit_size*16+disp)
+#define DISP8(ind, disp) (ind*unit_size*8+disp)
+#define DISP4(ind, disp) (ind*unit_size*4+disp)
+#define DISP2(ind, disp) (ind*unit_size*2+disp)
+#define DISP1(ind, disp) (ind*unit_size+disp)
+#define DISPX(disp)  (disp)
+
+/* Combine partial products in place; the signs depend on the build variant
+ * (xvsubsp T,A,B computes A-B, xvaddsp computes A+B):
+ *   NN/NT/TN/TT : OUT1 = OUT1 - VSINR ; OUT2 = OUT2 + VSINI
+ *   CN/CT/RN/RT : OUT1 = OUT1 + VSINR ; OUT2 = OUT2 - VSINI
+ *   NC/TC/NR/TR : OUT1 = OUT1 + VSINR ; OUT2 = VSINI - OUT2
+ *   otherwise   : OUT1 = VSINR - OUT1 ; OUT2 = OUT2 + VSINI
+ * VSINR_OUT1 and VSINI_OUT2 are both inputs and outputs. */
+.macro AGGREGATE_REALS_IMAGES  VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
+#if    defined(NN) || defined(NT) || defined(TN) || defined(TT)
+       xvsubsp  \VSINR_OUT1, \VSINR_OUT1, \VSINR
+       xvaddsp  \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#elif  defined(CN) || defined(CT) || defined(RN) || defined(RT)
+       xvaddsp  \VSINR_OUT1, \VSINR_OUT1, \VSINR
+       xvsubsp  \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#elif  defined(NC) || defined(TC) || defined(NR) || defined(TR)
+       xvaddsp  \VSINR_OUT1, \VSINR_OUT1, \VSINR
+       xvsubsp  \VSINI_OUT2, \VSINI, \VSINI_OUT2
+#else  // CC || CR || RC || RR
+    /* this case assumes alpha is supplied as {-alpha_r,-alpha_i} */
+    /* computes i1i2-r1r2; the negated alpha real part fixes the sign */
+       xvsubsp  \VSINR_OUT1, \VSINR, \VSINR_OUT1
+    /* the negated alpha imaginary part fixes the sign here */
+       xvaddsp  \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#endif
+.endm
+
+/* Same as AGGREGATE_REALS_IMAGES except that the OUT2 formulas of the
+ * CN/CT/RN/RT and NC/TC/NR/TR branches are exchanged:
+ *   NN/NT/TN/TT : OUT1 = OUT1 - VSINR ; OUT2 = OUT2 + VSINI
+ *   CN/CT/RN/RT : OUT1 = OUT1 + VSINR ; OUT2 = VSINI - OUT2
+ *   NC/TC/NR/TR : OUT1 = OUT1 + VSINR ; OUT2 = OUT2 - VSINI
+ *   otherwise   : OUT1 = VSINR - OUT1 ; OUT2 = OUT2 + VSINI
+ * VSINR_OUT1 and VSINI_OUT2 are both inputs and outputs. */
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE  VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
+#if    defined(NN) || defined(NT) || defined(TN) || defined(TT)
+       xvsubsp  \VSINR_OUT1, \VSINR_OUT1, \VSINR
+       xvaddsp  \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#elif  defined(CN) || defined(CT) || defined(RN) || defined(RT)
+       xvaddsp  \VSINR_OUT1, \VSINR_OUT1, \VSINR
+       xvsubsp  \VSINI_OUT2, \VSINI, \VSINI_OUT2
+#elif  defined(NC) || defined(TC) || defined(NR) || defined(TR)
+       xvaddsp  \VSINR_OUT1, \VSINR_OUT1, \VSINR
+       xvsubsp  \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#else  // CC || CR || RC || RR
+    /* this case assumes alpha is supplied as {-alpha_r,-alpha_i} */
+    /* computes i1i2-r1r2; the negated alpha real part fixes the sign */
+       xvsubsp  \VSINR_OUT1, \VSINR, \VSINR_OUT1
+    /* the negated alpha imaginary part fixes the sign here */
+       xvaddsp  \VSINI_OUT2, \VSINI_OUT2, \VSINI
+#endif
+.endm
+
+/* First half of the complex multiply by alpha (overwrites both outputs):
+ *   VSOUT1 = VSINII * alpha_i      (imaginary parts times alpha_i)
+ *   VSOUT2 = VSINRR * alpha_i      (real parts times alpha_i)
+ * MULT_APLHA_PART2 must follow with the same operands to finish the product. */
+
+.macro MULT_APLHA_PART1  VSINRR, VSINII, VSOUT1, VSOUT2
+       xvmulsp \VSOUT1, \VSINII, alpha_i
+       xvmulsp  \VSOUT2, \VSINRR, alpha_i
+.endm
+
+/* Second half of the complex multiply by alpha, using the PART1 results:
+ *   VSOUT1 = VSINRR * alpha_r - VSOUT1   (xvmsubasp: real part of the product)
+ *   VSOUT2 = VSINII * alpha_r + VSOUT2   (xvmaddasp: imaginary part) */
+
+.macro MULT_APLHA_PART2  VSINRR, VSINII, VSOUT1, VSOUT2
+       xvmsubasp  \VSOUT1, \VSINRR, alpha_r
+       xvmaddasp \VSOUT2, \VSINII, alpha_r
+.endm
+
+/* OUT = doubleword 0 of sel(R3,R4) followed by doubleword 1 of sel(R1,R2),
+ * where sel() is a bitwise xxsel under mask vs57 (set up elsewhere).
+ * Clobbers vs62 as scratch. */
+.macro PERMUTE1        OUT, R1, R2, R3, R4
+       xxsel   vs62, \R1, \R2, vs57
+       xxsel   \OUT, \R3, \R4, vs57
+       xxpermdi        \OUT, \OUT, vs62, 1
+.endm
+/* Like PERMUTE1 but with the xxsel operands swapped (R2,R1 and R4,R3), the
+ * doubleword merge taking sel(R2,R1) first, and a final xxperm of OUT with
+ * permute_mask.  Clobbers vs62 as scratch. */
+.macro PERMUTE2        OUT, R1, R2, R3, R4
+       xxsel   vs62, \R2, \R1, vs57
+       xxsel   \OUT, \R4, \R3, vs57
+       xxpermdi        \OUT, vs62, \OUT, 1
+       xxperm  \OUT, \OUT, permute_mask
+.endm
+/* OUT = doubleword 1 of sel(R1,R2) followed by doubleword 0 of sel(R3,R4),
+ * where sel() is a bitwise xxsel under mask vs57 (set up elsewhere).
+ * Clobbers vs62 as scratch. */
+.macro PERMUTE3        OUT, R1, R2, R3, R4
+       xxsel   vs62, \R1, \R2, vs57
+       xxsel   \OUT, \R3, \R4, vs57
+       xxpermdi \OUT, vs62, \OUT, 2
+.endm
+/* Like PERMUTE3 but with the xxsel operands swapped (R2,R1 and R4,R3), the
+ * doubleword merge taking sel(R4,R3) first, and a final xxperm of OUT with
+ * permute_mask.  Clobbers vs62 as scratch. */
+.macro PERMUTE4        OUT, R1, R2, R3, R4
+       xxsel   vs62, \R2, \R1, vs57
+       xxsel   \OUT, \R4, \R3, vs57
+       xxpermdi        \OUT, \OUT, vs62, 2
+       xxperm  \OUT, \OUT, permute_mask
+.endm
+/* xxperm with permute_mask from vs32/vs40, vs33/vs41, vs36/vs44, vs37/vs45
+ * into the scratch registers vs0/vs4, vs1/vs5, vs8/vs12, vs9/vs13.  These are
+ * the second operands consumed by AGG_GROUP1. */
+.macro GROUP1
+       xxperm  vs0, vs32, permute_mask
+       xxperm  vs4, vs40, permute_mask
+       xxperm  vs1, vs33, permute_mask
+       xxperm  vs5, vs41, permute_mask
+       xxperm  vs8, vs36, permute_mask
+       xxperm  vs12, vs44, permute_mask
+       xxperm  vs9, vs37, permute_mask
+       xxperm  vs13, vs45, permute_mask
+.endm
+/* Apply AGGREGATE_REALS_IMAGES to the four register sets prepared by GROUP1;
+ * results are left in vs32/vs40, vs33/vs41, vs36/vs44 and vs37/vs45. */
+.macro AGG_GROUP1
+       AGGREGATE_REALS_IMAGES  vs32, vs0, vs40, vs4
+       AGGREGATE_REALS_IMAGES  vs33, vs1, vs41, vs5
+       AGGREGATE_REALS_IMAGES  vs36, vs8, vs44, vs12
+       AGGREGATE_REALS_IMAGES  vs37, vs9, vs45, vs13
+.endm
+/* xxperm with permute_mask from vs34/vs42, vs35/vs43, vs38/vs46, vs39/vs47
+ * into the scratch registers vs0/vs4, vs1/vs5, vs8/vs12, vs9/vs13 (the same
+ * scratch set GROUP1 uses).  These feed AGG_GROUP2. */
+.macro GROUP2
+       xxperm  vs0, vs34, permute_mask
+       xxperm  vs4, vs42, permute_mask
+       xxperm  vs1, vs35, permute_mask
+       xxperm  vs5, vs43, permute_mask
+       xxperm  vs8, vs38, permute_mask
+       xxperm  vs12, vs46, permute_mask
+       xxperm  vs9, vs39, permute_mask
+       xxperm  vs13, vs47, permute_mask
+.endm
+/* Apply AGGREGATE_REALS_IMAGES to the four register sets prepared by GROUP2;
+ * results are left in vs34/vs42, vs35/vs43, vs38/vs46 and vs39/vs47. */
+.macro AGG_GROUP2
+       AGGREGATE_REALS_IMAGES  vs34, vs0, vs42, vs4
+       AGGREGATE_REALS_IMAGES  vs35, vs1, vs43, vs5
+       AGGREGATE_REALS_IMAGES  vs38, vs8, vs46, vs12
+       AGGREGATE_REALS_IMAGES  vs39, vs9, vs47, vs13
+.endm
+/* Multiply the group-1 values by alpha.  Inputs (real, imaginary) are
+ * vs32/vs40, vs33/vs41, vs36/vs44, vs37/vs45; outputs go to vs0/vs1,
+ * vs2/vs3, vs8/vs9, vs10/vs11.  All PART1 steps are issued before the PART2
+ * steps that consume their results. */
+.macro MULTIPLY_GROUP1
+       MULT_APLHA_PART1        vs32, vs40, vs0, vs1
+       MULT_APLHA_PART1        vs33, vs41, vs2, vs3
+       MULT_APLHA_PART1        vs36, vs44, vs8, vs9
+       MULT_APLHA_PART1        vs37, vs45, vs10, vs11
+       MULT_APLHA_PART2        vs32, vs40, vs0, vs1
+       MULT_APLHA_PART2        vs33, vs41, vs2, vs3
+       MULT_APLHA_PART2        vs36, vs44, vs8, vs9
+       MULT_APLHA_PART2        vs37, vs45, vs10, vs11
+.endm
+/* Multiply the group-2 values by alpha.  Inputs (real, imaginary) are
+ * vs34/vs42, vs35/vs43, vs38/vs46, vs39/vs47; outputs go to vs4/vs5,
+ * vs6/vs7, vs12/vs13, vs14/vs15.  All PART1 steps are issued before the PART2
+ * steps that consume their results. */
+.macro MULTIPLY_GROUP2
+       MULT_APLHA_PART1        vs34, vs42, vs4, vs5
+       MULT_APLHA_PART1        vs35, vs43, vs6, vs7
+       MULT_APLHA_PART1        vs38, vs46, vs12, vs13
+       MULT_APLHA_PART1        vs39, vs47, vs14, vs15
+       MULT_APLHA_PART2        vs34, vs42, vs4, vs5
+       MULT_APLHA_PART2        vs35, vs43, vs6, vs7
+       MULT_APLHA_PART2        vs38, vs46, vs12, vs13
+       MULT_APLHA_PART2        vs39, vs47, vs14, vs15
+.endm
+/* Reconstruct (r, i) pairs: xxperm with save_permute_1 merges each
+ * MULTIPLY_GROUP1 output pair (vs0/vs1, vs2/vs3, vs8/vs9, vs10/vs11) into
+ * its first register. */
+.macro RECONSTRUCT_PAIR1
+       xxperm  vs0, vs1, save_permute_1
+       xxperm  vs2, vs3, save_permute_1
+       xxperm  vs8, vs9, save_permute_1
+       xxperm  vs10, vs11, save_permute_1
+.endm
+/* Reconstruct (r, i) pairs for the MULTIPLY_GROUP2 outputs: xxperm with
+ * save_permute_1 merges vs4/vs5, vs6/vs7, vs12/vs13, vs14/vs15 into the
+ * first register of each pair. */
+.macro RECONSTRUCT_PAIR2
+       xxperm  vs4, vs5, save_permute_1
+       xxperm  vs6, vs7, save_permute_1
+       xxperm  vs12, vs13, save_permute_1
+       xxperm  vs14, vs15, save_permute_1
+.endm
+/* Move accumulator ACC out to its VSRs (xxmfacc) and rearrange them.
+ *   ACC        accumulator number
+ *   R0..R3     the four VSRs backing that accumulator
+ *   O1..O4     destinations of PERMUTE1..PERMUTE4
+ * Clobbers vs62 (scratch inside the PERMUTE macros); reads vs57 and
+ * permute_mask. */
+.macro SHUFFLE_ACC     ACC, R0, R1, R2, R3, O1, O2, O3, O4
+       xxmfacc \ACC
+       PERMUTE1        \O1, \R3, \R2, \R1, \R0
+       PERMUTE2        \O2, \R1, \R0, \R3, \R2
+       PERMUTE3        \O3, \R1, \R0, \R3, \R2
+       PERMUTE4        \O4, \R3, \R2, \R1, \R0
+.endm
+/*                                             macros for N=4 and M=8
+**********************************************************************************************/
+/* Zero all eight MMA accumulators (0..7) used by the 4x8 tile. */
+.macro ZERO4x8
+       xxsetaccz       0
+       xxsetaccz       1
+       xxsetaccz       2
+       xxsetaccz       3
+       xxsetaccz       4
+       xxsetaccz       5
+       xxsetaccz       6
+       xxsetaccz       7
+.endm
+
+/* Load the operands for one 4x8 step from the current AO/BO (zero offsets). */
+.macro LOAD4x8
+       LOAD4x8O        0, 0
+.endm
+
+/* Load the operands for one 4x8 step with explicit byte offsets.
+ * lxvp fills a register pair (32 bytes): vs34/vs35 from BO+OffsetB, and
+ * vs32/vs33 plus vs36/vs37 from AO+OffsetA and AO+OffsetA+32.
+ * AO and BO themselves are not modified. */
+.macro LOAD4x8O  OffsetA, OffsetB
+       lxvp    vs34, (\OffsetB+0)(BO)
+       lxvp    vs32, (\OffsetA+0)(AO)
+       lxvp    vs36, (\OffsetA+32)(AO)
+.endm
+
+/* Finish one 4x8 step and advance AO by 64 and BO by 32 bytes. */
+.macro END4x8_NORMAL
+       END4x8  AO, BO, 64, 32
+.endm
+
+/* Finish one 4x8 step without moving AO or BO. */
+.macro END4x8_WITHOUT_ADD
+       END4x8  AO, BO, 0, 0
+.endm
+
+/* Compute one 4x8 step on the registers loaded by LOAD4x8O.
+ *   AREG, BREG        pointer registers to advance
+ *   OffsetA, OffsetB  byte increments; an addi is emitted only when the
+ *                     offset is non-zero (assemble-time .if)
+ * The eight xvf32gerpp ops accumulate into accumulators 0..3 using vs35 and
+ * into 4..7 using vs34, each paired with vs33, vs32, vs37, vs36. */
+.macro END4x8  AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi    \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+       addi    \AREG, \AREG, \OffsetA
+.endif
+       xvf32gerpp      3, 36, 35
+       xvf32gerpp      2, 37, 35
+       xvf32gerpp      1, 32, 35
+       xvf32gerpp      0, 33, 35
+       xvf32gerpp      7, 36, 34
+       xvf32gerpp      6, 37, 34
+       xvf32gerpp      5, 32, 34
+       xvf32gerpp      4, 33, 34
+.endm
+
+/* Load the operands for two 4x8 steps from the current AO/BO (zero offsets). */
+.macro LOAD4x8_2
+       LOAD4x8_2O      0, 0
+.endm
+
+/* Load the operands for two 4x8 steps with explicit byte offsets.
+ * From BO: vs34/vs35 and vs38/vs39 (64 bytes).  From AO: vs32/vs33,
+ * vs36/vs37, vs40/vs41 and vs42/vs43 (128 bytes).  AO/BO are not modified. */
+.macro LOAD4x8_2O  OffsetA, OffsetB
+       lxvp    vs34, (\OffsetB)(BO)
+       lxvp    vs38, (32+\OffsetB)(BO)
+       lxvp    vs32, (0+\OffsetA)(AO)
+       lxvp    vs36, (32+\OffsetA)(AO)
+       lxvp    vs40, (64+\OffsetA)(AO)
+       lxvp    vs42, (64+32+\OffsetA)(AO)
+.endm
+
+/* Finish a two-step sequence: compute on the already loaded registers without
+ * reloading (Complete=1) and advance AO by 128 and BO by 64 bytes (IsLast=1). */
+.macro END4x8_2
+       /*for load2 offset will be 128 and 64*/
+       KERNEL4x8_2     AO, BO, 128, 64, 0, 1, 1
+.endm
+
+/* "End" form of the two-step 4x8 kernel: Complete=1, so nothing is reloaded. */
+.macro KERNEL4x8_E2    OffsetA, OffsetB, Index, IsLast
+       KERNEL4x8_2     AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* "Loop" form of the two-step 4x8 kernel: Complete=0, so the operands for the
+ * following steps are reloaded while computing. */
+.macro KERNEL4x8_L2    OffsetA, OffsetB, Index, IsLast
+       KERNEL4x8_2     AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Two 4x8 steps on the register sets loaded by LOAD4x8_2O.
+ *   AREG, BREG        pointer registers for A and B
+ *   OffsetA, OffsetB  base byte offsets for the reloads
+ *   Index             position within an unrolled sequence; scales the
+ *                     displacements via DISP16 (A) and DISP8 (B)
+ *   IsLast            1 = advance AREG/BREG at the end
+ *   Complete          0 = reload each register set right after it is used,
+ *                     1 = no reloads (final step)
+ * When IsLast=1 the pointer increment uses OffsetA/OffsetB if Complete=1 and
+ * the fixed values 128/64 otherwise. */
+.macro KERNEL4x8_2     AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+/* first step: consumes vs32..vs37 */
+       xvf32gerpp      3, 36, 35
+       xvf32gerpp      2, 37, 35
+       xvf32gerpp      1, 32, 35
+       xvf32gerpp      0, 33, 35
+       xvf32gerpp      7, 36, 34
+       xvf32gerpp      6, 37, 34
+       xvf32gerpp      5, 32, 34
+       xvf32gerpp      4, 33, 34
+.if \Complete==0
+       lxvp    vs34, DISP8(\Index, \OffsetB)(\BREG)
+       lxvp    vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
+       lxvp    vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
+.endif
+/* second step: consumes vs38..vs43 */
+       xvf32gerpp      3, 42, 39
+       xvf32gerpp      2, 43, 39
+       xvf32gerpp      1, 40, 39
+       xvf32gerpp      0, 41, 39
+       xvf32gerpp      7, 42, 38
+       xvf32gerpp      6, 43, 38
+       xvf32gerpp      5, 40, 38
+       xvf32gerpp      4, 41, 38
+.if \Complete==0
+       lxvp    vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
+       lxvp    vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
+       lxvp    vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \BREG, \BREG, DISP8(\Index, \OffsetB)
+       addi    \AREG, \AREG, DISP16(\Index, \OffsetA)
+.else
+       addi    \BREG, \BREG, DISP8(\Index, 64)
+       addi    \AREG, \AREG, DISP16(\Index, 128)
+.endif
+.endif
+.endm
+
+/* One complete single 4x8 step: load from AO/BO, advance AO by 64 and BO by
+ * 32 bytes, and accumulate. */
+.macro KERNEL4x8
+       LOAD4x8
+       END4x8  AO, BO, 64, 32
+.endm
+
+/* Write back the 4x8 tile held in accumulators 0..7.
+ * Steps: move every accumulator to VSRs and shuffle it (SHUFFLE_ACC), combine
+ * the partial products (AGGREGATE_REALS_IMAGES), multiply by alpha
+ * (MULT_APLHA_PART1/2), rebuild (r, i) pairs (RECONSTRUCT_PAIR1/2) and store
+ * 64 bytes at each of CO, T1, T2 and T3.
+ * Unless TRMMKERNEL is defined, the existing contents of C are loaded first
+ * and added to the result.  CO is advanced by 64 bytes at the end.
+ * Uses T1..T4 as temporaries (T1 = CO+LDC, T2 = CO+2*LDC, T3 = CO+3*LDC,
+ * T4 = 2*LDC).  The instruction interleaving is deliberate; do not reorder. */
+.macro SAVE4x8
+/* Shuffle outputs of accumulators 5..7 partly land in vs16..vs21, which the
+ * shuffles of accumulators 4 and 5 have already consumed by then. */
+       SHUFFLE_ACC     0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+       SHUFFLE_ACC     1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+       SHUFFLE_ACC     2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
+       SHUFFLE_ACC     3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
+       SHUFFLE_ACC     4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60
+       SHUFFLE_ACC     5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61
+       SHUFFLE_ACC     7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20
+       SHUFFLE_ACC     6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21
+       add     T4, LDC, LDC
+       add     T1, CO, LDC
+/* non-TRMM: fetch the current C values at CO and T1 */
+#ifndef TRMMKERNEL
+       lxvp    vs24, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+       lxvp    vs26, 32(CO)
+#endif
+#ifndef TRMMKERNEL
+       lxvp    vs28, 0(T1)
+#endif
+       xxperm  vs2, vs34, permute_mask
+       xxperm  vs6, vs42, permute_mask
+#ifndef TRMMKERNEL
+       lxvp    vs30, 32(T1)
+#endif
+       xxperm  vs3, vs35, permute_mask
+       xxperm  vs7, vs43, permute_mask
+       add     T2, CO, T4
+       add     T3, T1, T4
+       GROUP1
+       AGG_GROUP1
+       AGGREGATE_REALS_IMAGES  vs34, vs2, vs42, vs6
+       xxperm  vs10, vs38, permute_mask
+       xxperm  vs14, vs46, permute_mask
+       AGGREGATE_REALS_IMAGES  vs35, vs3, vs43, vs7
+       xxperm  vs11, vs39, permute_mask
+       xxperm  vs15, vs47, permute_mask
+       xxperm  vs0, vs48, permute_mask
+       xxperm  vs4, vs56, permute_mask
+       xxperm  vs1, vs49, permute_mask
+       xxperm  vs5, vs16, permute_mask
+       AGGREGATE_REALS_IMAGES  vs38, vs10, vs46, vs14
+       xxperm  vs2, vs50, permute_mask
+       xxperm  vs6, vs58, permute_mask
+       AGGREGATE_REALS_IMAGES  vs39, vs11, vs47, vs15
+       xxperm  vs3, vs17, permute_mask
+       xxperm  vs7, vs19, permute_mask
+       AGGREGATE_REALS_IMAGES  vs48, vs0, vs56, vs4
+       xxperm  vs8, vs52, permute_mask
+       xxperm  vs12, vs60, permute_mask
+       AGGREGATE_REALS_IMAGES  vs49, vs1, vs16, vs5
+       xxperm  vs9, vs53, permute_mask
+       xxperm  vs13, vs61, permute_mask
+       AGGREGATE_REALS_IMAGES  vs50, vs2, vs58, vs6
+       xxperm  vs10, vs54, permute_mask
+       xxperm  vs14, vs21, permute_mask
+       AGGREGATE_REALS_IMAGES  vs17, vs3, vs19, vs7
+       xxperm  vs11, vs18, permute_mask
+       xxperm  vs15, vs20, permute_mask
+       AGGREGATE_REALS_IMAGES  vs52, vs8, vs60, vs12
+       AGGREGATE_REALS_IMAGES  vs53, vs9, vs61, vs13
+/*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULT_APLHA_PART1    vs32, vs40, vs0, vs1
+       AGGREGATE_REALS_IMAGES  vs54, vs10, vs21, vs14
+       MULT_APLHA_PART1    vs33, vs41, vs2, vs3
+       AGGREGATE_REALS_IMAGES  vs18, vs11, vs20, vs15
+       MULT_APLHA_PART1    vs34, vs42, vs4, vs5
+       MULT_APLHA_PART1    vs35, vs43, vs6, vs7
+       MULT_APLHA_PART2    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART2    vs33, vs41, vs2, vs3
+       MULT_APLHA_PART2    vs34, vs42, vs4, vs5
+       MULT_APLHA_PART2    vs35, vs43, vs6, vs7
+/* vs32..vs35 and vs40..vs43 are reused below for the C values at T2 and T3,
+ * each loaded only after its last use as a multiply input */
+#ifndef TRMMKERNEL
+       lxvp    vs32, 0(T2)
+#endif
+       MULT_APLHA_PART1    vs36, vs44, vs8, vs9
+       MULT_APLHA_PART1    vs37, vs45, vs10, vs11
+#ifndef TRMMKERNEL
+       lxvp    vs40, 32(T2)
+#endif
+       MULT_APLHA_PART1    vs38, vs46, vs12, vs13
+       MULT_APLHA_PART1    vs39, vs47, vs14, vs15
+#ifndef TRMMKERNEL
+       lxvp    vs34, 0(T3)
+#endif
+       MULT_APLHA_PART2    vs36, vs44, vs8, vs9
+       MULT_APLHA_PART2    vs37, vs45, vs10, vs11
+#ifndef TRMMKERNEL
+       lxvp    vs42, 32(T3)
+#endif
+       MULT_APLHA_PART2    vs38, vs46, vs12, vs13
+       MULT_APLHA_PART2    vs39, vs47, vs14, vs15
+       RECONSTRUCT_PAIR1
+       RECONSTRUCT_PAIR2
+/* first half: results for the rows at CO and T1 */
+#ifndef TRMMKERNEL
+       /* add */
+       xxpermdi        vs1, vs8, vs0, 2
+       xxpermdi        vs3, vs10, vs2, 2
+       xxpermdi        vs5, vs12, vs4, 2
+       xxpermdi        vs7, vs14, vs6, 2
+       xxpermdi        vs9, vs0, vs8, 2
+       xxpermdi        vs11, vs2, vs10, 2
+       xvaddsp vs24, vs24, vs3
+       xvaddsp vs25, vs25, vs1
+       xxpermdi        vs13, vs4, vs12, 2
+       xxpermdi        vs15, vs6, vs14, 2
+       xvaddsp vs26, vs26, vs7
+       xvaddsp vs27, vs27, vs5
+       xvaddsp vs28, vs28, vs11
+       xvaddsp vs29, vs29, vs9
+       xvaddsp vs30, vs30, vs15
+       xvaddsp vs31, vs31, vs13
+#else
+       xxpermdi        vs25, vs8, vs0, 2
+       xxpermdi        vs24, vs10, vs2, 2
+       xxpermdi        vs27, vs12, vs4, 2
+       xxpermdi        vs26, vs14, vs6, 2
+       xxpermdi        vs29, vs0, vs8, 2
+       xxpermdi        vs28, vs2, vs10, 2
+       xxpermdi        vs31, vs4, vs12, 2
+       xxpermdi        vs30, vs6, vs14, 2
+#endif
+/* stores of the first half are interleaved with the alpha multiply of the
+ * second half (values from accumulators 4..7) */
+       stxvp   vs24, 0(CO)
+       MULT_APLHA_PART1    vs48, vs56, vs0, vs1
+       MULT_APLHA_PART1    vs49, vs16, vs2, vs3
+       stxvp   vs26, 32(CO)
+       MULT_APLHA_PART1    vs50, vs58, vs4, vs5
+       MULT_APLHA_PART1    vs17, vs19, vs6, vs7
+       stxvp   vs28, 0(T1)
+       MULT_APLHA_PART2    vs48, vs56, vs0, vs1
+       MULT_APLHA_PART2    vs49, vs16, vs2, vs3
+       stxvp   vs30, 32(T1)
+       MULT_APLHA_PART2    vs50, vs58, vs4, vs5
+       MULT_APLHA_PART2    vs17, vs19, vs6, vs7
+       MULT_APLHA_PART1    vs52, vs60, vs8, vs9
+       MULT_APLHA_PART1    vs53, vs61, vs10, vs11
+       MULT_APLHA_PART1    vs54, vs21, vs12, vs13
+       MULT_APLHA_PART1    vs18, vs20, vs14, vs15
+       MULT_APLHA_PART2    vs52, vs60, vs8, vs9
+       MULT_APLHA_PART2    vs53, vs61, vs10, vs11
+       MULT_APLHA_PART2    vs54, vs21, vs12, vs13
+       MULT_APLHA_PART2    vs18, vs20, vs14, vs15
+       RECONSTRUCT_PAIR1
+       RECONSTRUCT_PAIR2
+/* second half: results for the rows at T2 and T3 */
+#ifndef TRMMKERNEL
+  /* add */
+       xxpermdi        vs1, vs8, vs0, 2
+       xxpermdi        vs3, vs10, vs2, 2
+       xxpermdi        vs5, vs12, vs4, 2
+       xxpermdi        vs7, vs14, vs6, 2
+       xxpermdi        vs9, vs0, vs8, 2
+       xxpermdi        vs11, vs2, vs10, 2
+       xvaddsp vs32, vs32, vs3
+       xvaddsp vs33, vs33, vs1
+       xxpermdi        vs13, vs4, vs12, 2
+       xxpermdi        vs15, vs6, vs14, 2
+       xvaddsp vs40, vs40, vs7
+       xvaddsp vs41, vs41, vs5
+       xvaddsp vs34, vs34, vs11
+       xvaddsp vs35, vs35, vs9
+       xvaddsp vs42, vs42, vs15
+       xvaddsp vs43, vs43, vs13
+#else
+       xxpermdi        vs33, vs8, vs0, 2
+       xxpermdi        vs32, vs10, vs2, 2
+       xxpermdi        vs41, vs12, vs4, 2
+       xxpermdi        vs40, vs14, vs6, 2
+       xxpermdi        vs35, vs0, vs8, 2
+       xxpermdi        vs34, vs2, vs10, 2
+       xxpermdi        vs43, vs4, vs12, 2
+       xxpermdi        vs42, vs6, vs14, 2
+#endif
+       stxvp   vs32, 0(T2)
+       stxvp   vs40, 32(T2)
+       stxvp   vs34, 0(T3)
+       stxvp   vs42, 32(T3)
+       addi    CO, CO, 64
+.endm
+
+/*                                             macros for N=4 and M=4
+**********************************************************************************************/
+
+/* Zero the four MMA accumulators (0..3) used by the 4x4 tile. */
+.macro ZERO4x4
+       xxsetaccz       0
+       xxsetaccz       1
+       xxsetaccz       2
+       xxsetaccz       3
+.endm
+
+/* Load the operands for one 4x4 step from the current AO/BO (zero offsets). */
+.macro LOAD4x4
+       LOAD4x4O 0, 0
+.endm
+
+/* Load the operands for one 4x4 step with explicit byte offsets:
+ * vs34/vs35 from BO+OffsetB and vs32/vs33 from AO+OffsetA (32 bytes each).
+ * AO and BO themselves are not modified. */
+.macro LOAD4x4O  OffsetA, OffsetB
+       lxvp    vs34, (\OffsetB+0)(BO)
+       lxvp    vs32, (\OffsetA+0)(AO)
+.endm
+
+/* Finish one 4x4 step and advance both AO and BO by 32 bytes. */
+.macro END4x4_NORMAL
+       END4x4 AO, BO, 32, 32
+.endm
+
+/* Finish one 4x4 step without moving AO or BO. */
+.macro END4x4_WITHOUT_ADD
+       END4x4 AO, BO, 0, 0
+.endm
+
+/* Compute one 4x4 step on the registers loaded by LOAD4x4O.
+ *   AREG, BREG        pointer registers to advance
+ *   OffsetA, OffsetB  byte increments; an addi is emitted only when the
+ *                     offset is non-zero (assemble-time .if)
+ * Accumulators 3 and 2 use vs34, accumulators 1 and 0 use vs35, each paired
+ * with vs32 and vs33. */
+.macro END4x4  AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi    \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+       addi    \AREG, \AREG, \OffsetA
+.endif
+       xvf32gerpp      3, 32, 34
+       xvf32gerpp      2, 33, 34
+       xvf32gerpp      1, 32, 35
+       xvf32gerpp      0, 33, 35
+.endm
+
+/* Load the operands for two 4x4 steps from the current AO/BO (zero offsets). */
+.macro LOAD4x4_2
+       LOAD4x4_2O 0, 0
+.endm
+
+/* Load the operands for two 4x4 steps with explicit byte offsets.
+ * From BO: vs34/vs35 and vs38/vs39 (64 bytes).  From AO: vs32/vs33 and
+ * vs36/vs37 (64 bytes).  AO/BO are not modified. */
+.macro LOAD4x4_2O  OffsetA, OffsetB
+       lxvp    vs34, (\OffsetB)(BO)
+       lxvp    vs38, (32+\OffsetB)(BO)
+       lxvp    vs32, (0+\OffsetA)(AO)
+       lxvp    vs36, (32+\OffsetA)(AO)
+.endm
+
+/* Finish a two-step sequence: compute on the already loaded registers without
+ * reloading (Complete=1) and advance AO and BO by 64 bytes each (IsLast=1). */
+.macro END4x4_2
+  /*for load2 offset will be 64 and 64*/
+       KERNEL4x4_2     AO, BO, 64, 64, 0, 1, 1
+.endm
+
+/* "End" form of the two-step 4x4 kernel: Complete=1, so nothing is reloaded. */
+.macro KERNEL4x4_E2    OffsetA, OffsetB, Index, IsLast
+       KERNEL4x4_2     AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* "Loop" form of the two-step 4x4 kernel: Complete=0, so the operands for the
+ * following steps are reloaded while computing. */
+.macro KERNEL4x4_L2    OffsetA, OffsetB, Index, IsLast
+       KERNEL4x4_2     AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Two groups of four accumulations into accumulators 0-3 for the N=4, M=4 tile.
+ *   AREG, BREG        - address registers the operands are reloaded from
+ *   OffsetA, OffsetB  - byte displacements, combined with Index through DISP8
+ *                       (DISP8 is defined outside this section)
+ *   Index             - unroll position handed to DISP8
+ *   IsLast            - 1: advance AREG/BREG at the end of the macro
+ *   Complete          - 0: reload each operand set right after it has been consumed
+ *                       1: no reloads; with IsLast=1 the pointers advance by the
+ *                          caller-supplied offsets instead of the fixed 64 bytes
+ */
+.macro KERNEL4x4_2     AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+       xvf32gerpp      3, 32, 34
+       xvf32gerpp      2, 33, 34
+       xvf32gerpp      1, 32, 35
+       xvf32gerpp      0, 33, 35
+.if \Complete==0
+       lxvp    vs34, DISP8(\Index, \OffsetB)(\BREG)
+       lxvp    vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
+.endif
+       xvf32gerpp      3, 36, 38
+       xvf32gerpp      2, 37, 38
+       xvf32gerpp      1, 36, 39
+       xvf32gerpp      0, 37, 39
+.if \Complete==0
+       lxvp    vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
+       lxvp    vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi            \BREG, \BREG, DISP8(\Index, \OffsetB)
+       addi    \AREG, \AREG, DISP8(\Index, \OffsetA)
+.else
+       addi            \BREG, \BREG, DISP8(\Index, 64)
+       addi    \AREG, \AREG, DISP8(\Index, 64)
+.endif
+.endif
+.endm
+
+/* Single step: load via LOAD4x4, then accumulate and advance AO and BO by 32 bytes each. */
+.macro KERNEL4x4
+       LOAD4x4
+       END4x4  AO, BO, 32, 32
+.endm
+
+/* Write back the N=4, M=4 tile.
+ * Accumulators 0-3 are unpacked with SHUFFLE_ACC and post-processed by the GROUP*,
+ * AGG_GROUP*, MULTIPLY_GROUP* and RECONSTRUCT_PAIR* helpers (all defined outside
+ * this section). The results go to 32 bytes at each of CO, CO+LDC, CO+2*LDC and
+ * CO+3*LDC: added to the values already there when TRMMKERNEL is not defined,
+ * stored as-is otherwise. CO then advances by 32 bytes.
+ */
+.macro SAVE4x4
+       SHUFFLE_ACC     0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+       SHUFFLE_ACC     1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+       SHUFFLE_ACC     2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
+       SHUFFLE_ACC     3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
+  /* T4 = 2*LDC, T1 = CO+LDC, T2 = CO+2*LDC, T3 = CO+3*LDC */
+       add     T4, LDC, LDC
+       add     T1, CO, LDC
+#ifndef TRMMKERNEL
+       lxvp    vs24, 0(CO)
+#endif
+       add     T2, CO, T4
+       add     T3, T1, T4
+#ifndef TRMMKERNEL
+       lxvp    vs26, 0(T1)
+#endif
+ #ifndef TRMMKERNEL
+       lxvp    vs28, 0(T2)
+#endif
+#ifndef TRMMKERNEL
+       lxvp    vs30, 0(T3)
+#endif
+       GROUP1
+       AGG_GROUP1
+       GROUP2
+       AGG_GROUP2
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULTIPLY_GROUP1
+       MULTIPLY_GROUP2
+/* reconstruct r, i pairs*/
+       RECONSTRUCT_PAIR1
+       RECONSTRUCT_PAIR2
+#ifndef TRMMKERNEL
+  /* add */
+       xxpermdi        vs1, vs8, vs0, 2
+       xxpermdi        vs3, vs10, vs2, 2
+       xxpermdi        vs9, vs0, vs8, 2
+       xxpermdi        vs11, vs2, vs10, 2
+       xxpermdi        vs5, vs12, vs4, 2
+       xxpermdi        vs7, vs14, vs6, 2
+       xxpermdi        vs13, vs4, vs12, 2
+       xxpermdi        vs15, vs6, vs14, 2
+       xvaddsp vs24, vs24, vs3
+       xvaddsp vs25, vs25, vs1
+       xvaddsp vs26, vs26, vs11
+       xvaddsp vs27, vs27, vs9
+       xvaddsp vs28, vs28, vs7
+       xvaddsp vs29, vs29, vs5
+       xvaddsp vs30, vs30, vs15
+       xvaddsp vs31, vs31, vs13
+#else
+  /* TRMM: same doubleword merge as above, written straight into the store registers */
+       xxpermdi        vs25, vs8, vs0, 2
+       xxpermdi        vs24, vs10, vs2, 2
+       xxpermdi        vs27, vs0, vs8, 2
+       xxpermdi        vs26, vs2, vs10, 2
+       xxpermdi        vs29, vs12, vs4, 2
+       xxpermdi        vs28, vs14, vs6, 2
+       xxpermdi        vs31, vs4, vs12, 2
+       xxpermdi        vs30, vs6, vs14, 2
+#endif
+       stxvp   vs24, 0(CO)
+       stxvp   vs26, 0(T1)
+       stxvp   vs28, 0(T2)
+       stxvp   vs30, 0(T3)
+       addi  CO, CO, 32
+.endm
+
+/*                                             macros for N=4 and M=2
+**********************************************************************************************/
+
+/* Clear MMA accumulators 0 and 1, which the N=4, M=2 macros below accumulate into. */
+.macro ZERO4x2
+       xxsetaccz       0
+       xxsetaccz       1
+.endm
+
+/* Load the END4x2 operands from offset 0 of AO and BO. */
+.macro LOAD4x2
+       LOAD4x2O 0, 0
+.endm
+
+/* Load 16 bytes from AO+OffsetA into vs32 and a vector pair from BO+OffsetB into vs34/vs35. */
+.macro LOAD4x2O  OffsetA, OffsetB
+       lxv     vs32, (\OffsetA+0)(AO)
+       lxvp    vs34, (\OffsetB+0)(BO)
+.endm
+
+/* END4x2 that also advances AO by 16 bytes and BO by 32 bytes. */
+.macro END4x2_NORMAL
+       END4x2 AO, BO, 16, 32
+.endm
+
+/* END4x2 with zero offsets: accumulate only, AO and BO are left unchanged. */
+.macro END4x2_WITHOUT_ADD
+       END4x2 AO, BO, 0, 0
+.endm
+
+/* Advance BREG/AREG by OffsetB/OffsetA (each only when non-zero), then accumulate
+ * the products of vs34/vs35 with vs32 into accumulators 1 and 0.
+ */
+.macro END4x2  AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi    \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+       addi    \AREG, \AREG, \OffsetA
+.endif
+       xvf32gerpp      1, 34, 32
+       xvf32gerpp      0, 35, 32
+.endm
+
+/* Load the KERNEL4x2_2 operands from offset 0 of AO and BO. */
+.macro LOAD4x2_2
+       LOAD4x2_2O 0, 0
+.endm
+
+/* Load 32 bytes from AO+OffsetA into vs32/vs33 and 64 bytes from BO+OffsetB into
+ * vs34/vs35 and vs36/vs37 (the registers KERNEL4x2_2 consumes).
+ */
+.macro LOAD4x2_2O  OffsetA, OffsetB
+       lxvp    vs32, (\OffsetA)(AO)
+       lxvp    vs34, (0+\OffsetB)(BO)
+       lxvp    vs36, (32+\OffsetB)(BO)
+.endm
+
+/* Closing KERNEL4x2_2 pass: Complete=1 (no reloads) and IsLast=1 (advance AO/BO). */
+.macro END4x2_2
+  /*for load2 offset will be 32 and 64*/
+       KERNEL4x2_2     AO, BO, 32, 64, 0, 1, 1
+.endm
+
+/* KERNEL4x2_2 on AO/BO with Complete=1: accumulate the loaded operands without reloading. */
+.macro KERNEL4x2_E2    OffsetA, OffsetB, Index, IsLast
+       KERNEL4x2_2     AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* KERNEL4x2_2 on AO/BO with Complete=0: accumulate and reload operands for the next pass. */
+.macro KERNEL4x2_L2    OffsetA, OffsetB, Index, IsLast
+       KERNEL4x2_2     AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Two groups of two accumulations into accumulators 0-1 for the N=4, M=2 tile.
+ *   AREG, BREG        - address registers the operands are reloaded from
+ *   OffsetA, OffsetB  - byte displacements; A goes through DISP4 and B through DISP8
+ *                       (both defined outside this section)
+ *   Index             - unroll position handed to DISP4/DISP8
+ *   IsLast            - 1: advance AREG/BREG at the end of the macro
+ *   Complete          - 0: reload operands for the next pass; 1: no reloads
+ * The first group pairs vs34/vs35 with vs33, the second pairs vs36/vs37 with vs32.
+ */
+.macro KERNEL4x2_2     AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+       xvf32gerpp      1, 34, 33
+       xvf32gerpp      0, 35, 33
+.if \Complete==0
+       lxvp    vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
+.endif
+       xvf32gerpp      1, 36, 32
+       xvf32gerpp      0, 37, 32
+.if \Complete==0
+       lxvp    vs32, DISP4(\Index, \OffsetA)(\AREG)
+       lxvp    vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \AREG, \AREG, DISP4(\Index, \OffsetA)
+       addi            \BREG, \BREG, DISP8(\Index, \OffsetB)
+.else
+       addi    \AREG, \AREG, DISP4(\Index, 32)
+       addi            \BREG, \BREG, DISP8(\Index, 64)
+.endif
+.endif
+.endm
+
+/* Single step: load via LOAD4x2, then accumulate and advance AO by 16 and BO by 32 bytes. */
+.macro KERNEL4x2
+       LOAD4x2
+       END4x2  AO, BO, 16, 32
+.endm
+
+/* Write back the N=4, M=2 tile.
+ * Accumulators 0-1 are unpacked with SHUFFLE_ACC and post-processed by GROUP1,
+ * AGGREGATE_REALS_IMAGES_A_PERMUTE, MULTIPLY_GROUP1 and RECONSTRUCT_PAIR1 (all
+ * defined outside this section). The results go to 16 bytes at each of CO, CO+LDC,
+ * CO+2*LDC and CO+3*LDC: added to the values already there when TRMMKERNEL is not
+ * defined, stored as-is otherwise. CO then advances by 16 bytes.
+ */
+.macro SAVE4x2
+       SHUFFLE_ACC     0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+       SHUFFLE_ACC     1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+  /* T4 = 2*LDC, T1 = CO+LDC, T2 = CO+2*LDC, T3 = CO+3*LDC */
+       add     T4, LDC, LDC
+       add     T1, CO, LDC
+       add     T2, CO, T4
+       add     T3, T1, T4
+#ifndef TRMMKERNEL
+       lxv     vs24, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+       lxv     vs25, 0(T1)
+#endif
+#ifndef TRMMKERNEL
+       lxv     vs26, 0(T2)
+#endif
+#ifndef TRMMKERNEL
+       lxv     vs27, 0(T3)
+#endif
+       GROUP1
+       AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
+       AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
+       AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12
+       AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULTIPLY_GROUP1
+/* reconstruct r, i pairs*/
+       RECONSTRUCT_PAIR1
+#ifndef TRMMKERNEL
+  /* add */
+       xxpermdi        vs1, vs8, vs0, 0
+       xxpermdi        vs9, vs10, vs2, 0
+       xxpermdi        vs3, vs0, vs8, 3
+       xxpermdi        vs11, vs2, vs10, 3
+       xvaddsp vs24, vs24, vs1
+       xvaddsp vs26, vs26, vs9
+       xvaddsp vs25, vs25, vs3
+       xvaddsp vs27, vs27, vs11
+#else
+  /* TRMM: same doubleword merge as above, written straight into the store registers */
+       xxpermdi        vs24, vs8, vs0, 0
+       xxpermdi        vs26, vs10, vs2, 0
+       xxpermdi        vs25, vs0, vs8, 3
+       xxpermdi        vs27, vs2, vs10, 3
+#endif
+       stxv    vs24, 0(CO)
+       stxv    vs25, 0(T1)
+       stxv    vs26, 0(T2)
+       stxv    vs27, 0(T3)
+       addi  CO, CO, 16
+.endm
+
+/*                                             macros for N=4 and M=1
+**********************************************************************************************/
+
+/* Clear MMA accumulators 0 and 1, which the N=4, M=1 macros below accumulate into. */
+.macro ZERO4x1
+       xxsetaccz       0
+       xxsetaccz       1
+.endm
+
+/* Load the END4x1 operands from offset 0 of AO and BO. */
+.macro LOAD4x1
+       LOAD4x1O 0, 0
+.endm
+
+/* Load one doubleword (8 bytes) from AO+OffsetA into v0, i.e. vs32 (vN == vs(32+N)),
+ * and a vector pair from BO+OffsetB into vs34/vs35.
+ */
+.macro LOAD4x1O  OffsetA, OffsetB
+       lxsd    v0, (\OffsetA+0)(AO)
+       lxvp    vs34, (\OffsetB+0)(BO)
+.endm
+
+/* END4x1 that also advances AO by 8 bytes and BO by 32 bytes. */
+.macro END4x1_NORMAL
+       END4x1 AO, BO,8, 32
+.endm
+
+/* END4x1 with zero offsets: accumulate only, AO and BO are left unchanged. */
+.macro END4x1_WITHOUT_ADD
+       END4x1 AO, BO, 0, 0
+.endm
+
+/* Advance BREG/AREG by OffsetB/OffsetA (each only when non-zero), then accumulate
+ * the products of vs35/vs34 with vs32 into accumulators 0 and 1.
+ */
+.macro END4x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi  \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+       addi  \AREG, \AREG, \OffsetA
+.endif
+       xvf32gerpp          0, 35, 32
+       xvf32gerpp          1, 34, 32
+.endm
+
+/* Load the KERNEL4x1_2 operands from offset 0 of AO and BO. */
+.macro LOAD4x1_2
+       LOAD4x1_2O 0, 0
+.endm
+
+/* Load 16 bytes from AO+OffsetA and split them: vs33 and vs32 each receive one
+ * doubleword of the loaded vector paired with a zero doubleword from vs38.
+ * Then load 64 bytes from BO+OffsetB into vs34/vs35 and vs36/vs37.
+ */
+.macro LOAD4x1_2O  OffsetA, OffsetB
+       lxv     vs32, (\OffsetA)(AO)
+  /* v6 is vs38: zero it so it can supply the zero half of the xxpermdi merges */
+       vspltisb        v6, 0
+       xxpermdi        vs33, vs32, vs38, 0
+       xxpermdi        vs32, vs32, vs38, 2
+       lxvp    vs34, (0+\OffsetB)(BO)
+       lxvp    vs36, (32+\OffsetB)(BO)
+.endm
+
+/* Closing KERNEL4x1_2 pass: Complete=1 (no reloads) and IsLast=1 (advance AO/BO). */
+.macro END4x1_2
+  /*for load2 offset will be 16 and 64*/
+       KERNEL4x1_2  AO, BO, 16, 64, 0, 1, 1
+.endm
+
+/* KERNEL4x1_2 on AO/BO with Complete=1: accumulate the loaded operands without reloading. */
+.macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast
+       KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* KERNEL4x1_2 on AO/BO with Complete=0: accumulate and reload operands for the next pass. */
+.macro KERNEL4x1_L2 OffsetA, OffsetB, Index, IsLast
+       KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Two groups of two accumulations into accumulators 0-1 for the N=4, M=1 tile.
+ *   AREG, BREG        - address registers the operands are reloaded from
+ *   OffsetA, OffsetB  - byte displacements; A goes through DISP2 and B through DISP8
+ *                       (both defined outside this section)
+ *   Index             - unroll position handed to DISP2/DISP8
+ *   IsLast            - 1: advance AREG/BREG at the end of the macro
+ *   Complete          - 0: reload operands for the next pass; 1: no reloads
+ * The reload path reads vs38 as the zero half of the xxpermdi merges; it is not
+ * written here, so it presumably still holds the zero set by LOAD4x1_2O.
+ */
+.macro KERNEL4x1_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+       xvf32gerpp          0, 35, 32
+       xvf32gerpp          1, 34, 32
+.if \Complete==0
+       lxvp    vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
+.endif
+       xvf32gerpp          0, 37, 33
+       xvf32gerpp          1, 36, 33
+.if \Complete==0
+       lxv     vs32, DISP2(\Index, \OffsetA)(\AREG)
+       lxvp    vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
+       xxpermdi        vs33, vs32, vs38, 0
+       xxpermdi        vs32, vs32, vs38, 2
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \AREG, \AREG, DISP2(\Index, \OffsetA)
+       addi    \BREG, \BREG, DISP8(\Index, \OffsetB)
+.else
+       addi    \AREG, \AREG, DISP2(\Index, 16)
+       addi    \BREG, \BREG, DISP8(\Index, 64)
+.endif
+.endif
+.endm
+
+/* Single step: load via LOAD4x1, then accumulate and advance AO by 8 and BO by 32 bytes. */
+.macro KERNEL4x1
+       LOAD4x1
+       END4x1  AO, BO, 8, 32
+.endm
+
+/* Write back the N=4, M=1 tile.
+ * Accumulators 0-1 are unpacked with SHUFFLE_ACC, merged with xxpermdi, and
+ * post-processed by AGGREGATE_REALS_IMAGES_A_PERMUTE and MULT_APLHA_PART1/2 (all
+ * defined outside this section). One doubleword goes to each of CO, CO+LDC,
+ * CO+2*LDC and CO+3*LDC: added to the value already there when TRMMKERNEL is not
+ * defined, stored as-is otherwise. CO then advances by 8 bytes.
+ */
+.macro SAVE4x1
+       SHUFFLE_ACC     0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+       SHUFFLE_ACC     1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+       xxpermdi        vs32, vs32, vs36, 1
+       xxpermdi        vs40, vs40, vs44, 1
+       xxpermdi        vs33, vs33, vs37, 1
+       xxpermdi        vs41, vs41, vs45, 1
+  /* T4 = 2*LDC, T1 = CO+LDC, T2 = CO+2*LDC, T3 = CO+3*LDC */
+       add     T4, LDC, LDC
+       add     T1, CO, LDC
+       add     T2, CO, T4
+       add     T3, T1, T4
+#ifndef TRMMKERNEL
+       lxsd    v4, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+       lxsd    v5, 0(T1)
+#endif
+#ifndef TRMMKERNEL
+       lxsd    v6, 0(T2)
+#endif
+#ifndef TRMMKERNEL
+       lxsd    v7, 0(T3)
+#endif
+       xxperm  vs0, vs32, permute_mask
+       xxperm  vs4, vs40, permute_mask
+       xxperm  vs1, vs33, permute_mask
+       xxperm  vs5, vs41, permute_mask
+       AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
+       AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULT_APLHA_PART1    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART1    vs33, vs41, vs2, vs3
+       MULT_APLHA_PART2    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART2    vs33, vs41, vs2, vs3
+/* reconstruct r, i pairs*/
+       xxperm  vs0, vs1, save_permute_1
+       xxperm  vs2, vs3, save_permute_1
+#ifndef TRMMKERNEL
+  /* add */
+       xxspltd vs1, vs0, 0
+       xxspltd vs3, vs0, 1
+       xxspltd vs9, vs2, 0
+       xxspltd vs11, vs2, 1
+ /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
+       xvaddsp vs36, vs36, vs1
+       xvaddsp vs37, vs37, vs3
+       xvaddsp vs38, vs38, vs9
+       xvaddsp vs39, vs39, vs11
+#else
+ /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
+       xxspltd vs36, vs0, 0
+       xxspltd vs37, vs0, 1
+       xxspltd vs38, vs2, 0
+       xxspltd vs39, vs2, 1
+#endif
+       stxsd   v4, 0(CO)
+       stxsd   v5, 0(T1)
+       stxsd   v6, 0(T2)
+       stxsd   v7, 0(T3)
+       addi  CO, CO, 8
+.endm
+
+/*                                             macros for N=2 and M=8
+**********************************************************************************************/
+
+/* Clear MMA accumulators 0-3, which the N=2, M=8 macros below accumulate into. */
+.macro ZERO2x8
+       xxsetaccz       0
+       xxsetaccz       1
+       xxsetaccz       2
+       xxsetaccz       3
+.endm
+
+/* Load the END2x8 operands from offset 0 of AO and BO. */
+.macro LOAD2x8
+       LOAD2x8O 0, 0
+.endm
+
+/* Load 16 bytes from BO+OffsetB into vs34 and 64 bytes from AO+OffsetA into
+ * vs32/vs33 and vs36/vs37.
+ */
+.macro LOAD2x8O  OffsetA, OffsetB
+       lxv     vs34, (\OffsetB+0)(BO)
+       lxvp    vs32, (\OffsetA+0)(AO)
+       lxvp    vs36, (\OffsetA+32)(AO)
+.endm
+
+/* END2x8 that also advances AO by 64 bytes and BO by 16 bytes. */
+.macro END2x8_NORMAL
+       END2x8 AO, BO, 64, 16
+.endm
+
+/* END2x8 with zero offsets: accumulate only, AO and BO are left unchanged. */
+.macro END2x8_WITHOUT_ADD
+       END2x8 AO, BO, 0, 0
+.endm
+
+/* Advance BREG/AREG by OffsetB/OffsetA (each only when non-zero), then accumulate
+ * the products of vs37, vs36, vs33 and vs32 with vs34 into accumulators 2, 3, 0 and 1.
+ */
+.macro END2x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi  \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+       addi  \AREG, \AREG, \OffsetA
+.endif
+       xvf32gerpp      2, 37, 34
+       xvf32gerpp      3, 36, 34
+       xvf32gerpp      0, 33, 34
+       xvf32gerpp      1, 32, 34
+.endm
+
+/* Load the KERNEL2x8_2 operands from offset 0 of AO and BO. */
+.macro LOAD2x8_2
+       LOAD2x8_2O 0, 0
+.endm
+
+/* Load 32 bytes from BO+OffsetB into vs34/vs35 and 128 bytes from AO+OffsetA into
+ * the pairs starting at vs32, vs36, vs38 and vs40 (the registers KERNEL2x8_2 consumes).
+ */
+.macro LOAD2x8_2O  OffsetA, OffsetB
+       lxvp    vs34, (\OffsetB)(BO)
+       lxvp    vs32, (0+\OffsetA)(AO)
+       lxvp    vs36, (32+\OffsetA)(AO)
+       lxvp    vs38, (64+\OffsetA)(AO)
+       lxvp    vs40, (64+32+\OffsetA)(AO)
+.endm
+
+/* Closing KERNEL2x8_2 pass: Complete=1 (no reloads) and IsLast=1 (advance AO/BO). */
+.macro END2x8_2
+  /*for load2 offset will be 128 and 32*/
+       KERNEL2x8_2  AO, BO, 128, 32, 0, 1, 1
+.endm
+
+/* KERNEL2x8_2 on AO/BO with Complete=1: accumulate the loaded operands without reloading. */
+.macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast
+       KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* KERNEL2x8_2 on AO/BO with Complete=0: accumulate and reload operands for the next pass. */
+.macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast
+       KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Two groups of four accumulations into accumulators 0-3 for the N=2, M=8 tile.
+ *   AREG, BREG        - address registers the operands are reloaded from
+ *   OffsetA, OffsetB  - byte displacements; A goes through DISP16 and B through DISP4
+ *                       (both defined outside this section)
+ *   Index             - unroll position handed to DISP16/DISP4
+ *   IsLast            - 1: advance AREG/BREG at the end of the macro
+ *   Complete          - 0: reload operands for the next pass; 1: no reloads
+ * The first group pairs vs32/vs33/vs36/vs37 with vs35, the second pairs
+ * vs38-vs41 with vs34.
+ */
+.macro KERNEL2x8_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+       xvf32gerpp      2, 37, 35
+       xvf32gerpp      3, 36, 35
+       xvf32gerpp      0, 33, 35
+       xvf32gerpp      1, 32, 35
+
+.if \Complete==0
+       lxvp    vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
+       lxvp    vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
+.endif
+       xvf32gerpp      2, 41, 34
+       xvf32gerpp      3, 40, 34
+       xvf32gerpp      0, 39, 34
+       xvf32gerpp      1, 38, 34
+
+.if \Complete==0
+       lxvp    vs34, DISP4(\Index, \OffsetB)(\BREG)
+       lxvp    vs38, DISP16(\Index, 64+\OffsetA)(\AREG)
+       lxvp    vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \BREG, \BREG, DISP4(\Index, \OffsetB)
+       addi    \AREG, \AREG, DISP16(\Index, \OffsetA)
+.else
+       addi    \BREG, \BREG, DISP4(\Index, 32)
+       addi    \AREG, \AREG, DISP16(\Index, 128)
+.endif
+.endif
+.endm
+
+/* Single step: load via LOAD2x8, then accumulate and advance AO by 64 and BO by 16 bytes. */
+.macro KERNEL2x8
+       LOAD2x8
+       END2x8  AO, BO, 64, 16
+.endm
+
+/* Write back the N=2, M=8 tile.
+ * Accumulators 0-3 are unpacked with SHUFFLE_ACC and post-processed by the GROUP*,
+ * AGG_GROUP*, MULTIPLY_GROUP* and RECONSTRUCT_PAIR* helpers (all defined outside
+ * this section). The results go to 64 bytes at each of CO and CO+LDC: added to the
+ * values already there when TRMMKERNEL is not defined, stored as-is otherwise.
+ * CO then advances by 64 bytes.
+ *
+ * Only T1 (= CO+LDC) is needed for N=2. The former "add T2, CO, T4" and
+ * "add T3, T1, T4" were dropped: their results were never read in this macro and
+ * T4 is not set here, so they computed addresses from a stale register.
+ */
+.macro SAVE2x8
+       SHUFFLE_ACC     0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+       SHUFFLE_ACC     1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+       SHUFFLE_ACC     2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
+       SHUFFLE_ACC     3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
+       add     T1, CO, LDC
+#ifndef TRMMKERNEL
+       lxvp    vs24, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+       lxvp    vs26, 32(CO)
+#endif
+#ifndef TRMMKERNEL
+       lxvp    vs28, 0(T1)
+#endif
+#ifndef TRMMKERNEL
+       lxvp    vs30, 32(T1)
+#endif
+       GROUP1
+       AGG_GROUP1
+       GROUP2
+       AGG_GROUP2
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULTIPLY_GROUP1
+       MULTIPLY_GROUP2
+/* reconstruct r, i pairs*/
+       RECONSTRUCT_PAIR1
+       RECONSTRUCT_PAIR2
+#ifndef TRMMKERNEL
+  /* add */
+       xxpermdi        vs1, vs8, vs0, 2
+       xxpermdi        vs3, vs10, vs2, 2
+       xxpermdi        vs5, vs12, vs4, 2
+       xxpermdi        vs7, vs14, vs6, 2
+       xxpermdi        vs9, vs0, vs8, 2
+       xxpermdi        vs11, vs2, vs10, 2
+       xvaddsp vs24, vs24, vs3
+       xvaddsp vs25, vs25, vs1
+       xxpermdi        vs13, vs4, vs12, 2
+       xxpermdi        vs15, vs6, vs14, 2
+       xvaddsp vs26, vs26, vs7
+       xvaddsp vs27, vs27, vs5
+       xvaddsp vs28, vs28, vs11
+       xvaddsp vs29, vs29, vs9
+       xvaddsp vs30, vs30, vs15
+       xvaddsp vs31, vs31, vs13
+#else
+  /* TRMM: same doubleword merge as above, written straight into the store registers */
+       xxpermdi        vs25, vs8, vs0, 2
+       xxpermdi        vs24, vs10, vs2, 2
+       xxpermdi        vs27, vs12, vs4, 2
+       xxpermdi        vs26, vs14, vs6, 2
+       xxpermdi        vs29, vs0, vs8, 2
+       xxpermdi        vs28, vs2, vs10, 2
+       xxpermdi        vs31, vs4, vs12, 2
+       xxpermdi        vs30, vs6, vs14, 2
+#endif
+       stxvp   vs24, 0(CO)
+       stxvp   vs26, 32(CO)
+       stxvp   vs28, 0(T1)
+       stxvp   vs30, 32(T1)
+       addi  CO, CO, 64
+.endm
+
+/*                                             macros for N=2 and M=4
+**********************************************************************************************/
+
+/* Clear MMA accumulators 0 and 1, which the N=2, M=4 macros below accumulate into. */
+.macro ZERO2x4
+       xxsetaccz       0
+       xxsetaccz       1
+.endm
+
+/* Load the END2x4 operands from offset 0 of AO and BO. */
+.macro LOAD2x4
+       LOAD2x4O 0, 0
+.endm
+
+/* Load 16 bytes from BO+OffsetB into vs34 and a vector pair from AO+OffsetA into vs32/vs33. */
+.macro LOAD2x4O  OffsetA, OffsetB
+       lxv     vs34, (\OffsetB+0)(BO)
+       lxvp    vs32, (\OffsetA+0)(AO)
+.endm
+
+/* END2x4 that also advances AO by 32 bytes and BO by 16 bytes. */
+.macro END2x4_NORMAL
+       END2x4 AO, BO, 32, 16
+.endm
+
+/* END2x4 with zero offsets: accumulate only, AO and BO are left unchanged. */
+.macro END2x4_WITHOUT_ADD
+       END2x4 AO, BO, 0, 0
+.endm
+
+/* Advance BREG/AREG by OffsetB/OffsetA (each only when non-zero), then accumulate
+ * the products of vs33/vs32 with vs34 into accumulators 0 and 1.
+ */
+.macro END2x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi  \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+       addi  \AREG, \AREG, \OffsetA
+.endif
+       xvf32gerpp      0, 33, 34
+       xvf32gerpp      1, 32, 34
+.endm
+
+/* Load the KERNEL2x4_2 operands from offset 0 of AO and BO. */
+.macro LOAD2x4_2
+       LOAD2x4_2O 0, 0
+.endm
+
+/* Load 32 bytes from BO+OffsetB into vs34/vs35 and 64 bytes from AO+OffsetA into
+ * vs32/vs33 and vs36/vs37 (the registers KERNEL2x4_2 consumes).
+ */
+.macro LOAD2x4_2O  OffsetA, OffsetB
+       lxvp    vs34, (\OffsetB)(BO)
+       lxvp    vs32, (0+\OffsetA)(AO)
+       lxvp    vs36, (32+\OffsetA)(AO)
+.endm
+
+/* Closing KERNEL2x4_2 pass: Complete=1 (no reloads) and IsLast=1 (advance AO/BO). */
+.macro END2x4_2
+  /*for load2 offset will be 64 and 32*/
+       KERNEL2x4_2  AO, BO, 64, 32, 0, 1, 1
+.endm
+
+/* KERNEL2x4_2 on AO/BO with Complete=1: accumulate the loaded operands without reloading. */
+.macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast
+       KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* KERNEL2x4_2 on AO/BO with Complete=0: accumulate and reload operands for the next pass. */
+.macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast
+       KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Two groups of two accumulations into accumulators 0-1 for the N=2, M=4 tile.
+ *   AREG, BREG        - address registers the operands are reloaded from
+ *   OffsetA, OffsetB  - byte displacements; A goes through DISP8 and B through DISP4
+ *                       (both defined outside this section)
+ *   Index             - unroll position handed to DISP8/DISP4
+ *   IsLast            - 1: advance AREG/BREG at the end of the macro
+ *   Complete          - 0: reload operands for the next pass; 1: no reloads
+ * The first group pairs vs33/vs32 with vs35, the second pairs vs37/vs36 with vs34.
+ */
+.macro KERNEL2x4_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+       xvf32gerpp      0, 33, 35
+       xvf32gerpp      1, 32, 35
+.if \Complete==0
+       lxvp    vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
+.endif
+       xvf32gerpp      0, 37, 34
+       xvf32gerpp      1, 36, 34
+.if \Complete==0
+       lxvp    vs34, DISP4(\Index, \OffsetB)(\BREG)
+       lxvp    vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \BREG, \BREG, DISP4(\Index, \OffsetB)
+       addi    \AREG, \AREG, DISP8(\Index, \OffsetA)
+.else
+       addi    \BREG, \BREG, DISP4(\Index, 32)
+       addi    \AREG, \AREG, DISP8(\Index, 64)
+.endif
+.endif
+.endm
+
+/* Single step: load via LOAD2x4, then accumulate and advance AO by 32 and BO by 16 bytes. */
+.macro KERNEL2x4
+       LOAD2x4
+       END2x4  AO, BO, 32, 16
+.endm
+
+/* Write back the N=2, M=4 tile.
+ * Accumulators 0-1 are unpacked with SHUFFLE_ACC and post-processed by GROUP1,
+ * AGG_GROUP1, MULTIPLY_GROUP1 and RECONSTRUCT_PAIR1 (all defined outside this
+ * section). The results go to 32 bytes at each of CO and CO+LDC: added to the
+ * values already there when TRMMKERNEL is not defined, stored as-is otherwise.
+ * CO then advances by 32 bytes.
+ */
+.macro SAVE2x4
+       SHUFFLE_ACC     0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+       SHUFFLE_ACC     1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+       add     T1, CO, LDC
+#ifndef TRMMKERNEL
+       lxvp    vs24, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+       lxvp    vs26, 0(T1)
+#endif
+       GROUP1
+       AGG_GROUP1
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULTIPLY_GROUP1
+/* reconstruct r, i pairs*/
+       RECONSTRUCT_PAIR1
+#ifndef TRMMKERNEL
+  /* add */
+       xxpermdi        vs1, vs8, vs0, 2
+       xxpermdi        vs3, vs10, vs2, 2
+       xxpermdi        vs9, vs0, vs8, 2
+       xxpermdi        vs11, vs2, vs10, 2
+       xvaddsp vs24, vs24, vs3
+       xvaddsp vs25, vs25, vs1
+       xvaddsp vs26, vs26, vs11
+       xvaddsp vs27, vs27, vs9
+#else
+  /* TRMM: same doubleword merge as above, written straight into the store registers */
+       xxpermdi        vs25, vs8, vs0, 2
+       xxpermdi        vs24, vs10, vs2, 2
+       xxpermdi        vs27, vs0, vs8, 2
+       xxpermdi        vs26, vs2, vs10, 2
+#endif
+       stxvp   vs24, 0(CO)
+       stxvp   vs26, 0(T1)
+       addi  CO, CO, 32
+.endm
+
+/*                                             macros for N=2 and M=2
+**********************************************************************************************/
+
+/* Clear MMA accumulator 0, which the N=2, M=2 macros below accumulate into. */
+.macro ZERO2x2
+       xxsetaccz       0
+.endm
+
+/* Load the END2x2 operands from offset 0 of AO and BO. */
+.macro LOAD2x2
+       LOAD2x2O 0, 0
+.endm
+
+/* Load 16 bytes from AO+OffsetA into vs32 and 16 bytes from BO+OffsetB into vs34. */
+.macro LOAD2x2O  OffsetA, OffsetB
+       lxv     vs32, (\OffsetA+0)(AO)
+       lxv     vs34, (\OffsetB+0)(BO)
+.endm
+
+/* END2x2 that also advances AO and BO by 16 bytes each. */
+.macro END2x2_NORMAL
+       END2x2 AO, BO, 16, 16
+.endm
+
+/* END2x2 with zero offsets: accumulate only, AO and BO are left unchanged. */
+.macro END2x2_WITHOUT_ADD
+       END2x2 AO, BO, 0, 0
+.endm
+
+/* Advance BREG/AREG by OffsetB/OffsetA (each only when non-zero), then accumulate
+ * the product of vs34 with vs32 into accumulator 0.
+ */
+.macro END2x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi  \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+       addi  \AREG, \AREG, \OffsetA
+.endif
+       xvf32gerpp      0, 34, 32
+.endm
+
+/* Load the KERNEL2x2_2 operands from offset 0 of AO and BO. */
+.macro LOAD2x2_2
+       LOAD2x2_2O 0, 0
+.endm
+
+/* Load a vector pair from AO+OffsetA into vs32/vs33 and one from BO+OffsetB into vs34/vs35. */
+.macro LOAD2x2_2O  OffsetA, OffsetB
+       lxvp    vs32, (\OffsetA)(AO)
+       lxvp    vs34, (0+\OffsetB)(BO)
+.endm
+
+/* Closing KERNEL2x2_2 pass: Complete=1 (no reloads) and IsLast=1 (advance AO/BO). */
+.macro END2x2_2
+  /*for load2 offset will be 32 and 32*/
+       KERNEL2x2_2  AO, BO, 32, 32, 0, 1, 1
+.endm
+
+/* KERNEL2x2_2 on AO/BO with Complete=1: accumulate the loaded operands without reloading. */
+.macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast
+       KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* KERNEL2x2_2 on AO/BO with Complete=0: accumulate and reload operands for the next pass. */
+.macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast
+       KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Two accumulations into accumulator 0 for the N=2, M=2 tile.
+ *   AREG, BREG        - address registers the operands are reloaded from
+ *   OffsetA, OffsetB  - byte displacements, combined with Index through DISP4
+ *                       (DISP4 is defined outside this section)
+ *   Index             - unroll position handed to DISP4
+ *   IsLast            - 1: advance AREG/BREG at the end of the macro
+ *   Complete          - 0: reload operands for the next pass; 1: no reloads
+ */
+.macro KERNEL2x2_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+       xvf32gerpp      0, 34, 32
+       xvf32gerpp      0, 35, 33
+.if \Complete==0
+       lxvp    vs32, DISP4(\Index, \OffsetA)(\AREG)
+  /* the B reload must use OffsetB; it used to take OffsetA, which only worked
+     because callers pass identical offsets for A and B */
+       lxvp    vs34, DISP4(\Index, \OffsetB)(\BREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \AREG, \AREG, DISP4(\Index, \OffsetA)
+       addi    \BREG, \BREG, DISP4(\Index, \OffsetB)
+.else
+       addi    \AREG, \AREG, DISP4(\Index, 32)
+       addi    \BREG, \BREG, DISP4(\Index, 32)
+.endif
+.endif
+.endm
+
+/* Single step: load via LOAD2x2, then accumulate and advance AO and BO by 16 bytes each. */
+.macro KERNEL2x2
+       LOAD2x2
+       END2x2  AO, BO, 16, 16
+.endm
+
+/* Write back the N=2, M=2 tile.
+ * Accumulator 0 is unpacked with SHUFFLE_ACC and post-processed by
+ * AGGREGATE_REALS_IMAGES_A_PERMUTE and MULT_APLHA_PART1/2 (all defined outside
+ * this section). The results go to 16 bytes at each of CO and CO+LDC: added to
+ * the values already there when TRMMKERNEL is not defined, stored as-is otherwise.
+ * CO then advances by 16 bytes.
+ */
+.macro SAVE2x2
+       SHUFFLE_ACC     0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+       add     T1, CO, LDC
+#ifndef TRMMKERNEL
+       lxv     vs24, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+       lxv     vs26, 0(T1)
+#endif
+       xxperm  vs0, vs32, permute_mask
+       xxperm  vs4, vs40, permute_mask
+       xxperm  vs8, vs36, permute_mask
+       xxperm  vs12, vs44, permute_mask
+       AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
+       AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULT_APLHA_PART1    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART1    vs36, vs44, vs8, vs9
+       MULT_APLHA_PART2    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART2    vs36, vs44, vs8, vs9
+/* reconstruct r, i pairs*/
+       xxperm  vs0, vs1, save_permute_1
+       xxperm  vs8, vs9, save_permute_1
+#ifndef TRMMKERNEL
+  /* add */
+       xxpermdi        vs1, vs8, vs0, 0
+       xxpermdi        vs9, vs0, vs8, 3
+       xvaddsp vs24, vs24, vs1
+       xvaddsp vs26, vs26, vs9
+#else
+  /* TRMM: same doubleword merge as above, written straight into the store registers */
+       xxpermdi        vs24, vs8, vs0, 0
+       xxpermdi        vs26, vs0, vs8, 3
+#endif
+       stxv    vs24, 0(CO)
+       stxv    vs26, 0(T1)
+       addi  CO, CO, 16
+.endm
+
+/*                                             macros for N=2 and M=1
+**********************************************************************************************/
+
+/* Clear vs32 and vs40, the two registers END2x1 and KERNEL2x1_2 accumulate into
+ * with xvmaddasp (this tile does not use the MMA accumulators).
+ */
+.macro ZERO2x1
+       xxlxor  vs32, vs32, vs32
+       xxlxor  vs40, vs40, vs40
+.endm
+
+/* Load the END2x1 operands from offset 0 of AO and BO. */
+.macro LOAD2x1
+       LOAD2x1O 0, 0
+.endm
+
+/* Load one doubleword from AO+OffsetA into v4 (vs36) and 16 bytes from BO+OffsetB
+ * into vs0. vs24 gets doubleword 0 of vs36 replicated into both halves; vs26 is
+ * built from vs24 with xxperm under permute_mask (defined outside this section).
+ */
+.macro LOAD2x1O  OffsetA, OffsetB
+       lxsd    v4, (\OffsetA+0)(AO)
+       lxv     vs0, (\OffsetB+0)(BO)
+       xxspltd  vs24, vs36, 0
+       xxperm    vs26, vs24, permute_mask
+.endm
+
+/* END2x1 that also advances AO by 8 bytes and BO by 16 bytes. */
+.macro END2x1_NORMAL
+       END2x1 AO, BO,8, 16
+.endm
+
+/* END2x1 with zero offsets: accumulate only, AO and BO are left unchanged. */
+.macro END2x1_WITHOUT_ADD
+       END2x1 AO, BO, 0, 0
+.endm
+
+/* Advance BREG/AREG by OffsetB/OffsetA (each only when non-zero), then
+ * multiply-add vs0*vs24 into vs32 and vs0*vs26 into vs40.
+ */
+.macro END2x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi  \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+       addi  \AREG, \AREG, \OffsetA
+.endif
+       xvmaddasp       vs32, vs0, vs24
+       xvmaddasp       vs40, vs0, vs26
+.endm
+
+/* Load the KERNEL2x1_2 operands from offset 0 of AO and BO. */
+.macro LOAD2x1_2
+       LOAD2x1_2O 0, 0
+.endm
+
+/* Load 16 bytes from AO+OffsetA into vs27 and a vector pair from BO+OffsetB into
+ * vs4/vs5. vs8 and vs24 receive doubleword 1 and doubleword 0 of vs27, each
+ * replicated into both halves; vs10 and vs26 are built from them with xxperm
+ * under permute_mask (defined outside this section).
+ */
+.macro LOAD2x1_2O  OffsetA, OffsetB
+       lxv     vs27, (\OffsetA)(AO)
+       lxvp    vs4, (0+\OffsetB)(BO)
+       xxspltd  vs8, vs27, 1
+       xxspltd  vs24, vs27, 0
+       xxperm    vs10, vs8, permute_mask
+       xxperm    vs26, vs24, permute_mask
+.endm
+
+/* Closing KERNEL2x1_2 pass: Complete=1 (no reloads) and IsLast=1 (advance AO/BO). */
+.macro END2x1_2
+  /*for load2 offset will be 16 and 32*/
+       KERNEL2x1_2  AO, BO, 16, 32, 0, 1, 1
+.endm
+
+/* KERNEL2x1_2 on AO/BO with Complete=1: accumulate the loaded operands without reloading. */
+.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast
+       KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* KERNEL2x1_2 on AO/BO with Complete=0: accumulate and reload operands for the next pass. */
+.macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast
+       KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Two groups of two xvmaddasp multiply-adds into vs32 and vs40 for the N=2, M=1 tile.
+ *   AREG, BREG        - address registers the operands are reloaded from
+ *   OffsetA, OffsetB  - byte displacements; A goes through DISP2 and B through DISP4
+ *                       (both defined outside this section)
+ *   Index             - unroll position handed to DISP2/DISP4
+ *   IsLast            - 1: advance AREG/BREG at the end of the macro
+ *   Complete          - 0: reload and re-derive operands for the next pass; 1: no reloads
+ * The first group uses vs5 with vs8/vs10, the second uses vs4 with vs24/vs26.
+ */
+.macro KERNEL2x1_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+       xvmaddasp       vs32, vs5, vs8
+       xvmaddasp       vs40, vs5, vs10
+.if \Complete==0
+       lxv     vs27, DISP2(\Index, \OffsetA)(\AREG)
+       xxspltd  vs8, vs27, 1
+.endif
+.if \Complete==0
+       xxperm    vs10, vs8, permute_mask
+.endif
+       xvmaddasp       vs32, vs4, vs24
+       xvmaddasp       vs40, vs4, vs26
+.if \Complete==0
+       xxspltd  vs24, vs27, 0
+       xxperm   vs26, vs24, permute_mask
+.endif
+.if \Complete==0
+       lxvp    vs4, DISP4(\Index, 0+\OffsetB)(\BREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \AREG, \AREG, DISP2(\Index, \OffsetA)
+       addi    \BREG, \BREG, DISP4(\Index, \OffsetB)
+.else
+       addi    \AREG, \AREG, DISP2(\Index, 16)
+       addi    \BREG, \BREG, DISP4(\Index, 32)
+.endif
+.endif
+.endm
+
+/* Single step: load via LOAD2x1, then accumulate and advance AO by 8 and BO by 16 bytes. */
+.macro KERNEL2x1
+       LOAD2x1
+       END2x1  AO, BO, 8, 16
+.endm
+
+/* Write back the N=2, M=1 tile.
+ * The sums in vs32 and vs40 are post-processed by AGGREGATE_REALS_IMAGES_A_PERMUTE
+ * and MULT_APLHA_PART1/2 (defined outside this section). One doubleword goes to
+ * each of CO and CO+LDC: added to the value already there when TRMMKERNEL is not
+ * defined, stored as-is otherwise. CO then advances by 8 bytes.
+ */
+.macro SAVE2x1
+       add     T1, CO, LDC
+#ifndef TRMMKERNEL
+       lxsd    v4, 0(CO)
+#endif
+#ifndef TRMMKERNEL
+       lxsd    v5, 0(T1)
+#endif
+       xxperm  vs0, vs32, permute_mask
+       xxperm  vs4, vs40, permute_mask
+       AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
+  /* NOTE(review): the next call works on vs33/vs1/vs41/vs5, which none of the 2x1
+     macros in this section write, and its vs33/vs41 results are not referenced
+     again in this macro. It looks like a leftover from a wider tile; confirm
+     against the helper's definition before removing. */
+       AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULT_APLHA_PART1    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART2    vs32, vs40, vs0, vs1
+/* reconstruct r, i pairs*/
+       xxperm  vs0, vs1, save_permute_1
+#ifndef TRMMKERNEL
+  /* add */
+       xxspltd vs1, vs0, 0
+       xxspltd vs3, vs0, 1
+ /*--v4==vs36 v5==vs37---*/
+       xvaddsp vs36, vs36, vs1
+       xvaddsp vs37, vs37, vs3
+#else
+ /*--v4==vs36 v5==vs37---*/
+       xxspltd vs36, vs0, 0
+       xxspltd vs37, vs0, 1
+#endif
+       stxsd   v4, 0(CO)
+       stxsd   v5, 0(T1)
+       addi  CO, CO, 8
+.endm
+
+/*                                             macros for N=1 and M=8
+**********************************************************************************************/
+
+/* Clear MMA accumulators 0-3, which the N=1, M=8 macros below accumulate into. */
+.macro ZERO1x8
+       xxsetaccz       0
+       xxsetaccz       1
+       xxsetaccz       2
+       xxsetaccz       3
+.endm
+
+/* Load the END1x8 operands from offset 0 of AO and BO. */
+.macro LOAD1x8
+       LOAD1x8O 0, 0
+.endm
+
+/* Load one doubleword (8 bytes) from BO+OffsetB into v2, i.e. vs34 (vN == vs(32+N)),
+ * and 64 bytes from AO+OffsetA into vs32/vs33 and vs36/vs37.
+ */
+.macro LOAD1x8O  OffsetA, OffsetB
+       lxsd    v2, (\OffsetB+0)(BO)
+       lxvp    vs32, (\OffsetA+0)(AO)
+       lxvp    vs36, (\OffsetA+32)(AO)
+.endm
+
+/* END1x8 that also advances AO by 64 bytes and BO by 8 bytes. */
+.macro END1x8_NORMAL
+       END1x8 AO, BO, 64,8
+.endm
+
+/* END1x8 with zero offsets: accumulate only, AO and BO are left unchanged. */
+.macro END1x8_WITHOUT_ADD
+       END1x8 AO, BO, 0, 0
+.endm
+
+/* Advance BREG/AREG by OffsetB/OffsetA (each only when non-zero), then accumulate
+ * the products of vs34 with vs33, vs32, vs37 and vs36 into accumulators 0-3.
+ */
+.macro END1x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi  \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+       addi  \AREG, \AREG, \OffsetA
+.endif
+       xvf32gerpp          0, 34, 33
+       xvf32gerpp          1, 34, 32
+       xvf32gerpp          2, 34, 37
+       xvf32gerpp          3, 34, 36
+.endm
+
+/* Load the KERNEL1x8_2 operands from offset 0 of AO and BO. */
+.macro LOAD1x8_2
+       LOAD1x8_2O 0, 0
+.endm
+
+/* Load 16 bytes from BO+OffsetB and split them: vs35 and vs34 each receive one
+ * doubleword of the loaded vector paired with a zero doubleword from vs42.
+ * Also load 128 bytes from AO+OffsetA into the pairs starting at vs32, vs36,
+ * vs38 and vs40.
+ */
+.macro LOAD1x8_2O  OffsetA, OffsetB
+       lxv     vs34, (\OffsetB)(BO)
+       lxvp    vs32, (0+\OffsetA)(AO)
+       lxvp    vs36, (32+\OffsetA)(AO)
+  /* v10 is vs42: zero it so it can supply the zero half of the xxpermdi merges */
+       vspltisb        v10, 0
+       xxpermdi        vs35, vs34, vs42, 0
+       xxpermdi        vs34, vs34, vs42, 2
+       lxvp    vs38, (64+\OffsetA)(AO)
+       lxvp    vs40, (64+32+\OffsetA)(AO)
+.endm
+
+/* Closing KERNEL1x8_2 pass: Complete=1 (no reloads) and IsLast=1 (advance AO/BO). */
+.macro END1x8_2
+  /*for load2 offset will be 128 and 16*/
+       KERNEL1x8_2  AO, BO, 128, 16, 0, 1, 1
+.endm
+
+/* KERNEL1x8_2 on AO/BO with Complete=1: accumulate the loaded operands without reloading. */
+.macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast
+       KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* KERNEL1x8_2 on AO/BO with Complete=0: accumulate and reload operands for the next pass. */
+.macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast
+       KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Two groups of four accumulations into accumulators 0-3 for the N=1, M=8 tile.
+ *   AREG, BREG        - address registers the operands are reloaded from
+ *   OffsetA, OffsetB  - byte displacements; A goes through DISP16 and B through DISP2
+ *                       (both defined outside this section)
+ *   Index             - unroll position handed to DISP16/DISP2
+ *   IsLast            - 1: advance AREG/BREG at the end of the macro
+ *   Complete          - 0: reload operands for the next pass; 1: no reloads
+ * The first group pairs vs34 with vs32/vs33/vs36/vs37, the second pairs vs35
+ * with vs38-vs41. The reload path reads vs42 as the zero half of the xxpermdi
+ * merges; it is not written here, so it presumably still holds the zero set by
+ * LOAD1x8_2O.
+ */
+.macro KERNEL1x8_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+       xvf32gerpp          0, 34, 33
+       xvf32gerpp          1, 34, 32
+.if \Complete==0
+       lxvp    vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
+.endif
+       xvf32gerpp          2, 34, 37
+       xvf32gerpp          3, 34, 36
+.if \Complete==0
+       lxvp    vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
+.endif
+       xvf32gerpp          0, 35, 39
+       xvf32gerpp          1, 35, 38
+.if \Complete==0
+       lxvp    vs38, DISP16(\Index, 64+\OffsetA)(\AREG)
+.endif
+       xvf32gerpp          2, 35, 41
+       xvf32gerpp          3, 35, 40
+.if \Complete==0
+       lxv     vs34, DISP2(\Index, \OffsetB)(\BREG)
+       xxpermdi        vs35, vs34, vs42, 0
+       xxpermdi        vs34, vs34, vs42, 2
+       lxvp    vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \BREG, \BREG, DISP2(\Index, \OffsetB)
+       addi    \AREG, \AREG, DISP16(\Index, \OffsetA)
+.else
+       addi    \BREG, \BREG, DISP2(\Index, 16)
+       addi    \AREG, \AREG, DISP16(\Index, 128)
+.endif
+.endif
+.endm
+
+/* Single step: load via LOAD1x8, then accumulate and advance AO by 64 and BO by 8 bytes. */
+.macro KERNEL1x8
+       LOAD1x8
+       END1x8  AO, BO, 64,8
+.endm
+
+/* Write back the N=1, M=8 tile.
+ * Accumulators 0-3 are unpacked with SHUFFLE_ACC, merged and permuted, then
+ * post-processed by AGGREGATE_REALS_IMAGES and MULT_APLHA_PART1/2 (all defined
+ * outside this section). 64 bytes are written at CO: added to the values already
+ * there when TRMMKERNEL is not defined, stored as-is otherwise. CO then advances
+ * by 64 bytes.
+ */
+.macro SAVE1x8
+       SHUFFLE_ACC     0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+       SHUFFLE_ACC     1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+       SHUFFLE_ACC     2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
+       SHUFFLE_ACC     3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
+       xxpermdi        vs32, vs32, vs36, 0
+       xxpermdi        vs33, vs33, vs37, 0
+       xxpermdi        vs34, vs34, vs38, 0
+       xxpermdi        vs35, vs35, vs39, 0
+       xxpermdi        vs40, vs40, vs44, 0
+       xxperm vs40, vs40, permute_mask
+       xxpermdi        vs41, vs41, vs45, 0
+       xxperm vs41, vs41, permute_mask
+       xxpermdi        vs42, vs42, vs46, 0
+       xxperm vs42, vs42, permute_mask
+       xxpermdi        vs43, vs43, vs47, 0
+       xxperm vs43, vs43, permute_mask
+#ifndef TRMMKERNEL
+       lxvp    vs24, 0(CO)
+#endif
+       xxperm  vs0, vs32, permute_mask
+       xxperm  vs4, vs40, permute_mask
+#ifndef TRMMKERNEL
+       lxvp    vs26, 32(CO)
+#endif
+       xxperm  vs1, vs33, permute_mask
+       xxperm  vs5, vs41, permute_mask
+       xxperm  vs2, vs34, permute_mask
+       xxperm  vs6, vs42, permute_mask
+       xxperm  vs3, vs35, permute_mask
+       xxperm  vs7, vs43, permute_mask
+       AGGREGATE_REALS_IMAGES  vs32, vs0, vs40, vs4
+       AGGREGATE_REALS_IMAGES  vs33, vs1, vs41, vs5
+       AGGREGATE_REALS_IMAGES  vs34, vs2, vs42, vs6
+       AGGREGATE_REALS_IMAGES  vs35, vs3, vs43, vs7
+  /*inner reverse save_permute and store vs28 */
+       xxpermdi        vs28,save_permute_1,save_permute_1, 2
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULT_APLHA_PART1    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART1    vs33, vs41, vs2, vs3
+       MULT_APLHA_PART1    vs34, vs42, vs4, vs5
+       MULT_APLHA_PART1    vs35, vs43, vs6, vs7
+       MULT_APLHA_PART2    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART2    vs33, vs41, vs2, vs3
+       MULT_APLHA_PART2    vs34, vs42, vs4, vs5
+       MULT_APLHA_PART2    vs35, vs43, vs6, vs7
+/* reconstruct r, i pairs*/
+       xxperm  vs0, vs1, vs28
+       xxperm  vs2, vs3, vs28
+       xxperm  vs4, vs5, vs28
+       xxperm  vs6, vs7, vs28
+#ifndef TRMMKERNEL
+  /* add */
+       xvaddsp vs24, vs24, vs2
+       xvaddsp vs25, vs25, vs0
+       xvaddsp vs26, vs26, vs6
+       xvaddsp vs27, vs27, vs4
+       stxvp   vs24, 0(CO)
+       stxvp   vs26, 32(CO)
+#else
+/* TRMM: store the reconstructed results directly, 16 bytes at a time */
+       stxv    vs0, 0(CO)
+       stxv    vs2, 16(CO)
+       stxv    vs4, 32(CO)
+       stxv    vs6, 48(CO)
+#endif
+       addi  CO, CO, 64
+.endm
+
+/*                                             macros for N=1 and M=4
+**********************************************************************************************/
+
+/* Clear MMA accumulators 0 and 1, which the N=1, M=4 macros below accumulate into. */
+.macro ZERO1x4
+       xxsetaccz       0
+       xxsetaccz       1
+.endm
+
+/* Load the END1x4 operands from offset 0 of AO and BO. */
+.macro LOAD1x4
+       LOAD1x4O 0, 0
+.endm
+
+/* Load one doubleword (8 bytes) from BO+OffsetB into v2, i.e. vs34 (vN == vs(32+N)),
+ * and a vector pair from AO+OffsetA into vs32/vs33.
+ */
+.macro LOAD1x4O  OffsetA, OffsetB
+       lxsd    v2, (\OffsetB+0)(BO)
+       lxvp    vs32, (\OffsetA+0)(AO)
+.endm
+
+/* END1x4 that also advances AO by 32 bytes and BO by 8 bytes. */
+.macro END1x4_NORMAL
+       END1x4 AO, BO, 32,8
+.endm
+
+/* END1x4 with zero offsets: accumulate only, AO and BO are left unchanged. */
+.macro END1x4_WITHOUT_ADD
+       END1x4 AO, BO, 0, 0
+.endm
+
+/* Advance BREG/AREG by OffsetB/OffsetA (each only when non-zero), then accumulate
+ * the products of vs34 with vs33 and vs32 into accumulators 0 and 1.
+ */
+.macro END1x4 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi  \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+       addi  \AREG, \AREG, \OffsetA
+.endif
+       xvf32gerpp          0, 34, 33
+       xvf32gerpp          1, 34, 32
+.endm
+
+/* Load the KERNEL1x4_2 operands from offset 0 of AO and BO. */
+.macro LOAD1x4_2
+       LOAD1x4_2O 0, 0
+.endm
+
+/* Load 16 bytes from BO+OffsetB and split them: vs35 and vs34 each receive one
+ * doubleword of the loaded vector paired with a zero doubleword from vs38.
+ * Also load 64 bytes from AO+OffsetA into vs32/vs33 and vs36/vs37.
+ */
+.macro LOAD1x4_2O  OffsetA, OffsetB
+       lxv     vs34, (\OffsetB)(BO)
+       lxvp    vs32, (0+\OffsetA)(AO)
+  /* v6 is vs38: zero it so it can supply the zero half of the xxpermdi merges */
+       vspltisb        v6, 0
+       xxpermdi        vs35, vs34, vs38, 0
+       xxpermdi        vs34, vs34, vs38, 2
+       lxvp    vs36, (32+\OffsetA)(AO)
+.endm
+
+/* Drain the 2x-unrolled 1x4 pipeline: multiply-accumulate the operands that
+   are already loaded, reload nothing, then advance AO and BO. */
+.macro END1x4_2
+  /*for load2 offset will be 64 and 16*/
+       KERNEL1x4_2  AO, BO, 64, 16, 0, 1, 1
+.endm
+
+/* KERNEL1x4_2 on AO/BO with Complete=1 (no operand reload). */
+.macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast
+       KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* KERNEL1x4_2 on AO/BO with Complete=0 (reload operands for the next round). */
+.macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast
+       KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Body of the 2x-unrolled 1x4 loop: two k-steps per expansion.
+   AREG, BREG      : pointer registers for A and B.
+   OffsetA, OffsetB: byte displacements passed to DISP8/DISP2.
+   Index           : unroll position passed to DISP8/DISP2 (defined elsewhere).
+   IsLast          : 1 => advance AREG/BREG at the end of this expansion.
+   Complete        : 1 => do not reload operands;
+                     0 => reload vs32..vs37 for the next expansion.
+   Relies on vs38 still being zero from LOAD1x4_2O. */
+.macro KERNEL1x4_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+       xvf32gerpp          0, 34, 33
+       xvf32gerpp          1, 34, 32
+.if \Complete==0
+  /* vs32/vs33 were last read just above, so they can be refilled here */
+       lxvp    vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
+.endif
+       xvf32gerpp          0, 35, 37
+       xvf32gerpp          1, 35, 36
+.if \Complete==0
+       lxv     vs34, DISP2(\Index, \OffsetB)(\BREG)
+  /* same doubleword split as LOAD1x4_2O; vs35 first, then vs34 in place */
+       xxpermdi        vs35, vs34, vs38, 0
+       xxpermdi        vs34, vs34, vs38, 2
+       lxvp    vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \BREG, \BREG, DISP2(\Index, \OffsetB)
+       addi    \AREG, \AREG, DISP8(\Index, \OffsetA)
+.else
+  /* reload path: fixed increments of 16 (B) and 64 (A), OffsetA/B not used */
+       addi    \BREG, \BREG, DISP2(\Index, 16)
+       addi    \AREG, \AREG, DISP8(\Index, 64)
+.endif
+.endif
+.endm
+
+/* One complete k-step: load, accumulate, advance AO by 32 and BO by 8 bytes. */
+.macro KERNEL1x4
+       LOAD1x4
+       END1x4  AO, BO, 32,8
+.endm
+
+/* Write back the 1x4 tile and advance CO by 32 bytes.
+   Accumulators 0 and 1 are unpacked with SHUFFLE_ACC, reduced with
+   AGGREGATE_REALS_IMAGES and scaled with MULT_APLHA_PART1/PART2 (all defined
+   elsewhere in this file), then re-interleaved through vs28.
+   Without TRMMKERNEL the result is added to the 32 bytes already at CO;
+   with TRMMKERNEL it is stored directly. */
+.macro SAVE1x4
+       SHUFFLE_ACC     0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
+       SHUFFLE_ACC     1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
+  /* keep doubleword 0 of each shuffled pair */
+       xxpermdi        vs32, vs32, vs36, 0
+       xxpermdi        vs40, vs40, vs44, 0
+       xxpermdi        vs33, vs33, vs37, 0
+       xxpermdi        vs41, vs41, vs45, 0
+       xxperm vs40, vs40, permute_mask
+       xxperm vs41, vs41, permute_mask
+#ifndef TRMMKERNEL
+  /* current contents of C, loaded early and only used by the final add */
+       lxvp    vs24, 0(CO)
+#endif
+       xxperm  vs0, vs32, permute_mask
+       xxperm  vs4, vs40, permute_mask
+       xxperm  vs1, vs33, permute_mask
+       xxperm  vs5, vs41, permute_mask
+       AGGREGATE_REALS_IMAGES  vs32, vs0, vs40, vs4
+       AGGREGATE_REALS_IMAGES  vs33, vs1, vs41, vs5
+  /* vs28 = save_permute_1 with its two doublewords swapped */
+       xxpermdi        vs28,save_permute_1,save_permute_1, 2
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULT_APLHA_PART1    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART1    vs33, vs41, vs2, vs3
+       MULT_APLHA_PART2    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART2    vs33, vs41, vs2, vs3
+/* reconstruct r, i pairs*/
+       xxperm  vs0, vs1, vs28
+       xxperm  vs2, vs3, vs28
+#ifndef TRMMKERNEL
+  /* add: vs25 pairs with vs0 and vs24 with vs2, matching the TRMM stores of
+     vs0 at 0(CO) and vs2 at 16(CO) below.
+     NOTE(review): presumably lxvp/stxvp put the lower address in the odd
+     register of the pair on little-endian -- confirm. */
+       xvaddsp vs24, vs24, vs2
+       xvaddsp vs25, vs25, vs0
+       stxvp   vs24, 0(CO)
+#else
+/* TRMM: nothing to add, store the two result vectors directly */
+       stxv    vs0, 0(CO)
+       stxv    vs2, 16(CO)
+#endif
+       addi  CO, CO, 32
+.endm
+
+/*                                             macros for N=1 and M=2
+**********************************************************************************************/
+
+/* Clear vs32 and vs40, the two VSX registers the 1x2 macros accumulate into
+   (this tile uses xvmaddasp rather than the MMA accumulators). */
+.macro ZERO1x2
+       xxlxor  vs32, vs32, vs32
+       xxlxor  vs40, vs40, vs40
+.endm
+
+/* Single k-step operand load at zero displacement from AO/BO. */
+.macro LOAD1x2
+       LOAD1x2O 0, 0
+.endm
+
+/* Single k-step operand load for the 1x2 tile.
+   OffsetA, OffsetB: byte displacements applied to AO and BO.
+   lxsd addresses the upper half of the VSX file, so its target is written as
+   v4 (= vs36), the same spelling LOAD1x4O and LOAD1x1O use; the following
+   xxspltd reads vs36.  vs0 receives 16 bytes of A.  vs24 is doubleword 0 of
+   vs36 replicated, and vs26 is derived from vs24 through permute_mask. */
+.macro LOAD1x2O  OffsetA, OffsetB
+       lxsd    v4, (\OffsetB+0)(BO)
+       lxv     vs0, (\OffsetA+0)(AO)
+       xxspltd   vs24, vs36, 0
+       xxperm    vs26, vs24, permute_mask
+.endm
+
+/* Finish a single k-step and advance AO by 16 bytes and BO by 8 bytes. */
+.macro END1x2_NORMAL
+       END1x2 AO, BO, 16,8
+.endm
+
+/* Finish a single k-step without moving AO or BO. */
+.macro END1x2_WITHOUT_ADD
+       END1x2 AO, BO, 0, 0
+.endm
+
+/* Consume the operands prepared by LOAD1x2O.
+   AREG, BREG      : pointer registers to advance.
+   OffsetA, OffsetB: byte increments; a zero increment emits no addi.
+   vs32 += vs0 * vs24 and vs40 += vs0 * vs26. */
+.macro END1x2 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi  \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+       addi  \AREG, \AREG, \OffsetA
+.endif
+       xvmaddasp       vs32, vs0, vs24
+       xvmaddasp       vs40, vs0, vs26
+.endm
+
+/* Two-k-step operand load at zero displacement from AO/BO. */
+.macro LOAD1x2_2
+       LOAD1x2_2O 0, 0
+.endm
+
+/* Prime the 2x-unrolled 1x2 loop.
+   OffsetA, OffsetB: byte displacements applied to AO and BO.
+   vs27 receives 16 bytes of B and the vs4/vs5 pair 32 bytes of A.
+   Doubleword 1 of vs27 is replicated into vs8 and doubleword 0 into vs24;
+   vs10 and vs26 are derived from them through permute_mask. */
+.macro LOAD1x2_2O  OffsetA, OffsetB
+       lxv     vs27, (\OffsetB)(BO)
+       lxvp    vs4, (0+\OffsetA)(AO)
+       xxspltd  vs8, vs27, 1
+       xxspltd  vs24, vs27, 0
+       xxperm    vs10, vs8, permute_mask
+       xxperm    vs26, vs24, permute_mask
+.endm
+
+/* Drain the 2x-unrolled 1x2 pipeline: multiply-accumulate the operands that
+   are already loaded, reload nothing, then advance AO and BO. */
+.macro END1x2_2
+  /*for load2 offset will be 32 and 16*/
+       KERNEL1x2_2  AO, BO, 32, 16, 0, 1, 1
+.endm
+
+/* KERNEL1x2_2 on AO/BO with Complete=1 (no operand reload). */
+.macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast
+       KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* KERNEL1x2_2 on AO/BO with Complete=0 (reload operands for the next round). */
+.macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast
+       KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Body of the 2x-unrolled 1x2 loop: two k-steps per expansion.
+   AREG, BREG      : pointer registers for A and B.
+   OffsetA, OffsetB: byte displacements passed to DISP4/DISP2.
+   Index           : unroll position passed to DISP4/DISP2 (defined elsewhere).
+   IsLast          : 1 => advance AREG/BREG at the end of this expansion.
+   Complete        : 1 => do not reload operands; 0 => reload them.
+   Each operand register is refreshed only after the multiply that reads it
+   for the last time, so the statement order matters. */
+.macro KERNEL1x2_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+.if \Complete==0
+  /* next B vector goes to vs27, which no multiply below reads directly */
+       lxv     vs27, DISP2(\Index, \OffsetB)(\BREG)
+.endif
+       xvmaddasp       vs32, vs5, vs8
+       xvmaddasp       vs40, vs5, vs10
+
+.if \Complete==0
+       xxspltd  vs8, vs27, 1
+       xxperm    vs10, vs8, permute_mask
+.endif
+       xvmaddasp       vs32, vs4, vs24
+       xvmaddasp       vs40, vs4, vs26
+.if \Complete==0
+       lxvp    vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
+.endif
+
+.if \Complete==0
+       xxspltd  vs24, vs27, 0
+       xxperm    vs26, vs24, permute_mask
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \BREG, \BREG, DISP2(\Index, \OffsetB)
+       addi    \AREG, \AREG, DISP4(\Index, \OffsetA)
+.else
+  /* reload path: fixed increments of 16 (B) and 32 (A), OffsetA/B not used */
+       addi    \BREG, \BREG, DISP2(\Index, 16)
+       addi    \AREG, \AREG, DISP4(\Index, 32)
+.endif
+.endif
+.endm
+
+/* One complete k-step: load, accumulate, advance AO by 16 and BO by 8 bytes. */
+.macro KERNEL1x2
+       LOAD1x2
+       END1x2  AO, BO, 16,8
+.endm
+
+/* Write back the 1x2 tile (16 bytes) and advance CO by 16 bytes.
+   vs32/vs40 are reduced with AGGREGATE_REALS_IMAGES and scaled with
+   MULT_APLHA_PART1/PART2 (defined elsewhere), then re-interleaved via vs28.
+   Without TRMMKERNEL the result is added to the value already at CO;
+   with TRMMKERNEL it is stored directly. */
+.macro SAVE1x2
+#ifndef TRMMKERNEL
+  /* current contents of C, only used by the final add */
+       lxv     vs24, 0(CO)
+#endif
+       xxperm  vs0, vs32, permute_mask
+       xxperm  vs4, vs40, permute_mask
+       AGGREGATE_REALS_IMAGES  vs32, vs0, vs40, vs4
+  /* vs28 = save_permute_1 with its two doublewords swapped */
+       xxpermdi        vs28,save_permute_1,save_permute_1, 2
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULT_APLHA_PART1    vs32, vs40, vs0, vs1
+       MULT_APLHA_PART2    vs32, vs40, vs0, vs1
+/* reconstruct r, i pairs*/
+       xxperm  vs0, vs1, vs28
+#ifndef TRMMKERNEL
+  /* add */
+       xvaddsp vs24, vs24, vs0
+       stxv    vs24, 0(CO)
+#else
+/* TRMM: nothing to add, store the result vector directly */
+       stxv    vs0, 0(CO)
+#endif
+       addi  CO, CO, 16
+.endm
+
+/*                                             macros for N=1 and M=1
+**********************************************************************************************/
+/* Clear vs32 and vs40, the two VSX registers the 1x1 macros accumulate into. */
+.macro ZERO1x1
+       xxlxor  vs32, vs32, vs32
+       xxlxor  vs40, vs40, vs40
+.endm
+
+/* Single k-step operand load at zero displacement from AO/BO. */
+.macro LOAD1x1
+       LOAD1x1O 0, 0
+.endm
+
+/* Single k-step operand load for the 1x1 tile.
+   OffsetA, OffsetB: byte displacements applied to AO and BO.
+   v4 (= vs36) receives 8 bytes of B and v5 (= vs37) 8 bytes of A;
+   vs38 is derived from vs36 through permute_mask. */
+.macro LOAD1x1O  OffsetA, OffsetB
+       lxsd    v4, (\OffsetB+0)(BO)
+       lxsd    v5, (\OffsetA+0)(AO)
+       xxperm    vs38, vs36, permute_mask
+.endm
+
+/* Finish a single k-step and advance both AO and BO by 8 bytes. */
+.macro END1x1_NORMAL
+       END1x1 AO, BO,8,8
+.endm
+
+/* Finish a single k-step without moving AO or BO. */
+.macro END1x1_WITHOUT_ADD
+       END1x1 AO, BO, 0, 0
+.endm
+
+/* Consume the operands prepared by LOAD1x1O.
+   AREG, BREG      : pointer registers to advance.
+   OffsetA, OffsetB: byte increments; a zero increment emits no addi.
+   vs32 += vs37 * vs36 and vs40 += vs37 * vs38. */
+.macro END1x1 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+       addi  \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+       addi  \AREG, \AREG, \OffsetA
+.endif
+       xvmaddasp       vs32, vs37, vs36
+       xvmaddasp       vs40, vs37, vs38
+.endm
+
+/* Two-k-step operand load at zero displacement from AO/BO. */
+.macro LOAD1x1_2
+       LOAD1x1_2O 0, 0
+.endm
+
+/* Prime the 2x-unrolled 1x1 loop.
+   OffsetA, OffsetB: byte displacements applied to AO and BO.
+   vs8 receives 16 bytes of B and vs4 16 bytes of A; vs10 is derived from vs8
+   through permute_mask. */
+.macro LOAD1x1_2O  OffsetA, OffsetB
+       lxv     vs8, (\OffsetB)(BO)
+       lxv     vs4, (0+\OffsetA)(AO)
+       xxperm    vs10, vs8, permute_mask
+.endm
+
+/* Drain the 2x-unrolled 1x1 pipeline: multiply-accumulate the operands that
+   are already loaded, reload nothing, then advance AO and BO. */
+.macro END1x1_2
+  /*for load2 offset will be 16 and 16*/
+       KERNEL1x1_2  AO, BO, 16, 16, 0, 1, 1
+.endm
+
+/* KERNEL1x1_2 on AO/BO with Complete=1 (no operand reload). */
+.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast
+       KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+
+/* KERNEL1x1_2 on AO/BO with Complete=0 (reload operands for the next round). */
+.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast
+       KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+
+/* Body of the 2x-unrolled 1x1 loop: two k-steps per expansion.
+   AREG, BREG      : pointer registers for A and B.
+   OffsetA, OffsetB: byte displacements passed to DISP2.
+   Index           : unroll position passed to DISP2 (defined elsewhere).
+   IsLast          : 1 => advance AREG/BREG at the end of this expansion.
+   Complete        : 1 => do not reload operands; 0 => reload vs4/vs8/vs10.
+   vs32 += vs4 * vs8 and vs40 += vs4 * vs10. */
+.macro KERNEL1x1_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+       xvmaddasp       vs32, vs4, vs8
+       xvmaddasp       vs40, vs4, vs10
+.if \Complete==0
+       lxv     vs8, DISP2(\Index, \OffsetB)(\BREG)
+  /* the A operand is addressed with OffsetA; using OffsetB here only worked
+     while callers passed the same value for both */
+       lxv     vs4, DISP2(\Index, \OffsetA)(\AREG)
+       xxperm    vs10, vs8, permute_mask
+.endif
+.if \IsLast==1
+.if \Complete==1
+       addi    \BREG, \BREG, DISP2(\Index, \OffsetB)
+       addi    \AREG, \AREG, DISP2(\Index, \OffsetA)
+.else
+  /* reload path: fixed increments of 16 for both pointers */
+       addi    \BREG, \BREG, DISP2(\Index, 16)
+       addi    \AREG, \AREG, DISP2(\Index, 16)
+.endif
+.endif
+.endm
+
+/* One complete k-step: load, accumulate, advance AO and BO by 8 bytes each. */
+.macro KERNEL1x1
+       LOAD1x1
+       END1x1  AO, BO, 8,8
+.endm
+
+/* Write back the 1x1 tile (8 bytes) and advance CO by 8 bytes.
+   The two doubleword halves of vs32 and of vs40 are summed first, then the
+   result is reduced with AGGREGATE_REALS_IMAGES and scaled with
+   MULT_APLHA_PART1/PART2 (defined elsewhere) into vs37 (= v5).
+   Without TRMMKERNEL the result is added to the value already at CO;
+   with TRMMKERNEL it is stored directly. */
+.macro SAVE1x1
+#ifndef TRMMKERNEL
+  /* current contents of C go to v4 (= vs36), used by the final add */
+       lxsd    v4, 0(CO)
+#endif
+  /* aggregate x2: add each accumulator to its doubleword-swapped copy */
+       xxpermdi        vs33, vs32, vs32, 2
+       xxpermdi        vs41, vs40, vs40, 2
+       xvaddsp vs32, vs32, vs33
+       xvaddsp vs40, vs40, vs41
+
+       xxperm  vs0, vs32, permute_mask
+       xxperm  vs4, vs40, permute_mask
+       AGGREGATE_REALS_IMAGES  vs32, vs0, vs40, vs4
+  /* vs28 = save_permute_1 with its two doublewords swapped */
+       xxpermdi        vs28,save_permute_1,save_permute_1, 2
+  /*VSINRR, VSINII, VSOUT1, VSOUT2*/
+       MULT_APLHA_PART1    vs32, vs40, vs37, vs1
+       MULT_APLHA_PART2    vs32, vs40, vs37, vs1
+/* reconstruct r, i pairs*/
+       xxperm  vs37, vs1, vs28
+#ifndef TRMMKERNEL
+  /* add */
+       xvaddsp vs36, vs36, vs37
+       stxsd   v4, 0(CO)
+#else
+/* vs37 is v5 */
+       stxsd   v5, 0(CO)
+#endif
+       addi  CO, CO, 8
+.endm
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+/* REG1 = REG2 * SHIFT_VAL * 8, implemented as a left shift.
+   SHIFT_VAL must be 16, 8, 4, 2 or 1 (shift counts 7..3); any other value
+   matches no branch and emits no instruction, leaving REG1 untouched.
+   NOTE(review): slwi is a 32-bit shift -- confirm the scaled offsets always
+   fit in 32 bits. */
+.macro SHIFT_REG       REG1,REG2,SHIFT_VAL
+.if \SHIFT_VAL==16
+       slwi            \REG1, \REG2, 7
+.elseif \SHIFT_VAL==8
+       slwi            \REG1, \REG2, 6
+.elseif \SHIFT_VAL==4
+       slwi            \REG1, \REG2, 5
+.elseif \SHIFT_VAL==2
+       slwi            \REG1, \REG2, 4
+.elseif \SHIFT_VAL==1
+       slwi            \REG1, \REG2, 3
+.endif
+.endm
+
+/*
+//#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+//             ptrbb = bb;
+// #else
+//             ptrba += off*8;
+//             ptrbb = bb + off*4;
+// #endif
+*/
+/* Position the A and B pointers at the start of a TRMM tile.
+   PTR_A, PTR_B: pointer registers to set up.
+   OFF_VAL     : register holding the current offset ("off").
+   B_VAL       : register holding the base of the packed B panel ("bb").
+   C_A, C_B    : values per k-step in A and in B (valid SHIFT_REG arguments).
+   In the first configuration only PTR_B is reset and PTR_A is left alone;
+   otherwise both pointers are moved by off scaled values, using T4 and T2 as
+   scratch registers. */
+.macro REFRESH_POINTERS  PTR_A,PTR_B, OFF_VAL, B_VAL, C_A, C_B
+#if (defined(LEFT) &&  defined(TRANSA)) ||  (!defined(LEFT) && !defined(TRANSA))
+/* ptrbb = bb;*/
+       mr \PTR_B, \B_VAL     /* refresh BPOINT */
+#else
+/*
+// ptrba  =ptrba+ off*C_A;
+// ptrbb = bb + off*C_B;
+*/
+       SHIFT_REG T4, \OFF_VAL, \C_B    /* Number of values in B shifted  */
+       SHIFT_REG T2, \OFF_VAL, \C_A    /* Number of values in A shifted  */
+       add     \PTR_B, \B_VAL, T4      /* Add values to BO */
+       add     \PTR_A, \PTR_A, T2      /* Add values to AO  */
+#endif
+.endm
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+//             temp = bk-off;
+// #elif defined(LEFT)
+//             temp = off+8;   // number of values in A
+// #else
+//             temp = off+4;   // number of values in B
+// #endif
+*/
+/* Compute the inner-loop trip count for a TRMM tile.
+   TEMP_BK       : destination register.
+   BK_VAL        : register holding the full k count ("bk").
+   OFF_VAL       : register holding the current offset ("off").
+   INCR_A, INCR_B: immediates, the number of values in A and in B.
+   Exactly one of the three branches is assembled, chosen by LEFT/TRANSA. */
+.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B
+    #if (defined(LEFT) && !defined(TRANSA)) ||  (!defined(LEFT) && defined(TRANSA))
+       /* temp = bk-off;*/
+       sub \TEMP_BK, \BK_VAL, \OFF_VAL
+    #elif defined(LEFT)
+       /* temp = off+INCR_A;   // number of values in A */
+       addi \TEMP_BK, \OFF_VAL, \INCR_A
+    #else
+       /* temp = off+INCR_B    // number of values in B*/
+       addi \TEMP_BK, \OFF_VAL, \INCR_B
+    #endif
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+//             temp = bk - off;
+// #ifdef LEFT
+//             temp -= 8; // number of values in A
+// #else
+//             temp -= 4; // number of values in B
+// #endif
+//             ptrba += temp*8;
+//             ptrbb += temp*4;
+// #endif
+
+// #ifdef LEFT
+//             off += 8; // number of values in A
+// #endif
+*/
+/* TRMM bookkeeping after a tile has been stored.
+   TEMP_BK     : scratch/destination register for the remaining count.
+   BK_VAL      : register holding the full k count ("bk").
+   OFF_VAL     : register holding the current offset ("off").
+   PTR_B, PTR_A: B and A pointer registers (note the order: B first).
+   C_A, C_B    : values per k-step in A and in B (valid SHIFT_REG arguments).
+   In the first configuration the pointers skip the part of the panels that
+   was not multiplied, using T4 and T2 as scratch.  With LEFT defined, off is
+   then advanced by C_A. */
+.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL,PTR_B,PTR_A, C_A, C_B
+    #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       /*temp = bk - off;*/
+       sub \TEMP_BK, \BK_VAL, \OFF_VAL
+    #ifdef LEFT
+       /*temp -= C_A; // number of values in A*/
+       addi \TEMP_BK, \TEMP_BK,-\C_A
+    #else
+       /*temp -= C_B; // number of values in B*/
+       addi \TEMP_BK, \TEMP_BK,-\C_B
+    #endif
+       /*ptrba += temp*C_A;
+       ptrbb += temp*C_B;*/
+       SHIFT_REG T4, \TEMP_BK, \C_A
+       SHIFT_REG T2, \TEMP_BK, \C_B
+       add \PTR_A, \PTR_A, T4/*ptrba+temp*C_A*/
+       add \PTR_B, \PTR_B, T2
+    #endif
+    #ifdef LEFT
+       /*off += C_A; // number of values in A*/
+       addi \OFF_VAL, \OFF_VAL, \C_A
+    #endif
+.endm
diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c

new file mode 100644 (file)

index 0000000..b3ee301
--- /dev/null
+++ b/kernel/power/dgemm_kernel_power10.c
@@ -0,0 +1,864 @@
+/*********************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+
+/* 16-byte opaque vector: the operand type handed to the MMA builtins.  */
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+/* 16-byte vector of FLOAT.  Despite the "4sf" in the name it is initialised
+   with two elements throughout this file (e.g. { alpha, alpha }), so FLOAT is
+   presumably double here -- FLOAT comes from common.h; confirm.  */
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+/* 8-byte vector of FLOAT.  */
+typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
+
+/* Accumulator write-back helpers.
+   Each SAVE macro disassembles ACC (a pointer to a __vector_quad) into
+   result[0..3] and writes the vectors to C scaled by alpha.  CO, ldc, alpha,
+   rowC and result must be in scope at the point of use.  result[] is consumed
+   in reverse order: result[3] goes to the lowest ldc multiple.
+   TRMMKERNEL builds overwrite C; all other builds accumulate into C.
+   The trailing semicolons are part of the macros, as before.  */
+#ifdef TRMMKERNEL
+#define ACC_STORE_VEC(N, J, VAL) \
+          rowC = (v4sf_t *) &CO[(N) * ldc + (J)]; \
+          rowC[0] = (VAL) * alpha;
+#else
+#define ACC_STORE_VEC(N, J, VAL) \
+          rowC = (v4sf_t *) &CO[(N) * ldc + (J)]; \
+          rowC[0] += (VAL) * alpha;
+#endif
+
+/* Vectors at CO[0*ldc+J] .. CO[3*ldc+J].  */
+#define SAVE_ACC(ACC, J)  \
+          __builtin_mma_disassemble_acc (result, (ACC)); \
+          ACC_STORE_VEC (0, J, result[3]) \
+          ACC_STORE_VEC (1, J, result[2]) \
+          ACC_STORE_VEC (2, J, result[1]) \
+          ACC_STORE_VEC (3, J, result[0])
+
+/* Vectors at CO[4*ldc+J] .. CO[7*ldc+J].  */
+#define SAVE_ACC1(ACC, J)  \
+          __builtin_mma_disassemble_acc (result, (ACC)); \
+          ACC_STORE_VEC (4, J, result[3]) \
+          ACC_STORE_VEC (5, J, result[2]) \
+          ACC_STORE_VEC (6, J, result[1]) \
+          ACC_STORE_VEC (7, J, result[0])
+
+/* Only CO[0*ldc+J] and CO[1*ldc+J]; result[1] and result[0] are not stored.  */
+#define  SAVE2x4_ACC(ACC, J)  \
+          __builtin_mma_disassemble_acc (result, (ACC)); \
+          ACC_STORE_VEC (0, J, result[3]) \
+          ACC_STORE_VEC (1, J, result[2])
+
+#define SET_ACC_ZERO4() \
+          __builtin_mma_xxsetaccz (&acc0); \
+          __builtin_mma_xxsetaccz (&acc1); \
+          __builtin_mma_xxsetaccz (&acc2); \
+          __builtin_mma_xxsetaccz (&acc3);
+
+#define SET_ACC_ZERO8() \
+          __builtin_mma_xxsetaccz (&acc0); \
+          __builtin_mma_xxsetaccz (&acc1); \
+          __builtin_mma_xxsetaccz (&acc2); \
+          __builtin_mma_xxsetaccz (&acc3); \
+          __builtin_mma_xxsetaccz (&acc4); \
+          __builtin_mma_xxsetaccz (&acc5); \
+          __builtin_mma_xxsetaccz (&acc6); \
+          __builtin_mma_xxsetaccz (&acc7);
+
+/* Data-cache prefetch hint for the byte address (x) + (y).
+   dcbt RA,RB forms its address as (RA|0)+(RB): naming r0 in the RA slot means
+   the constant 0, not the contents of r0.  The pointer x sits in the RA slot
+   and therefore needs the "b" (base register, never r0) constraint; the
+   displacement y sits in the RB slot, where any GPR ("r") is valid.  */
+#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
+
+/* TRMM bookkeeping.  x is the number of A values and y the number of B values
+   per k-step of the calling tile; off, k, temp, AO, BO and B must be in scope
+   at the point of use.  Arguments are parenthesised so that expression
+   arguments scale correctly.  Each macro keeps its trailing semicolon because
+   the macros are chained without separators.  */
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+/* temp = k - off; x and y are unused in this configuration.  */
+#define REFRESH_TEMP_BK(x, y) \
+            temp = k - off;
+#elif defined(LEFT)
+#define REFRESH_TEMP_BK(x, y) \
+            temp = off + (x);
+#else
+#define REFRESH_TEMP_BK(x, y) \
+            temp = off + (y);
+#endif
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+/* Restart B at the panel base; AO is left where it is.  */
+#define REFRESH_POINTERS(x, y) \
+          BO = B; \
+          REFRESH_TEMP_BK(x, y)
+#else
+/* Skip the first off k-steps of both panels.  */
+#define REFRESH_POINTERS(x, y) \
+          AO += off * (x); \
+          BO = B + off * (y); \
+          REFRESH_TEMP_BK(x, y)
+#endif
+
+/* With LEFT defined, advance off by the x values just processed;
+   otherwise expand to nothing.  */
+#ifdef LEFT
+#define REFRESH_OFF(x) \
+            off += (x);
+#else
+#define REFRESH_OFF(x)
+#endif
+
+/* Subtract the tile extent from temp: x (A values) with LEFT, else y.  */
+#ifdef LEFT
+#define UPDATE_TEMP(x, y) \
+            temp -= (x);
+#else
+#define UPDATE_TEMP(x, y) \
+            temp -= (y);
+#endif
+
+/* In this configuration, move AO and BO past the k-steps that were not
+   multiplied for the tile; in the other configuration expand to nothing.
+   Arguments are parenthesised so that expression arguments scale correctly.  */
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_TMP_AFTER_SAVE(x, y) \
+            temp = k - off; \
+            UPDATE_TEMP(x, y) \
+            AO += temp * (x); \
+            BO += temp * (y);
+#else
+#define REFRESH_TMP_AFTER_SAVE(x, y)
+#endif
+
+/* Full post-store TRMM update: pointer skip first, then the off advance.
+   Expands to complete statements, so call sites need no trailing ';'.  */
+#define REFRESH_AFTER_SAVE(x,y) \
+        REFRESH_TMP_AFTER_SAVE(x, y) \
+        REFRESH_OFF(x)
+/*************************************************************************************
+* GEMM Kernel
+*************************************************************************************/
+int
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
+       FLOAT * C, BLASLONG ldc
+#ifdef TRMMKERNEL
+       , BLASLONG offset
+#endif
+  )
+{
+  BLASLONG N = n;
+  BLASLONG i1;
+#if defined(TRMMKERNEL)
+  BLASLONG off;
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+  off = -offset;
+#endif
+  v4sf_t valpha = { alpha, alpha };
+  N = n >> 2;
+  for (i1 = 0; i1 < N; i1++)
+    {
+      BLASLONG i, j, temp;
+      FLOAT *CO;
+      FLOAT *AO;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      CO = C;
+      C += ldc << 2;
+      AO = A;
+      PREFETCH1 (A, 128);
+      PREFETCH1 (A, 256);
+      i = m >> 4;
+      for (j = 0; j < i; j++)
+       {
+          FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (16, 4);
+#else
+          BO = B;
+          temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         BLASLONG l = 0;
+         PREFETCH1 (CO, 0);
+         PREFETCH1 (CO + ldc, 0);
+         PREFETCH1 (CO + ldc + ldc, 0);
+         PREFETCH1 (CO + ldc + ldc + ldc, 0);
+         PREFETCH1 (CO, 128);
+         PREFETCH1 (CO + ldc, 128);
+         PREFETCH1 (CO + ldc + ldc, 128);
+         PREFETCH1 (CO + ldc + ldc + ldc, 128);
+         __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+         SET_ACC_ZERO8 ();
+         for (l = 0; l < temp; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[l << 4];
+             __vector_pair rowB;
+             vec_t *rb = (vec_t *) & BO[l << 2];
+             __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+             __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+             __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+             __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+             __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+             __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
+             __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
+             __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
+             __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+           }
+         SAVE_ACC (&acc0, 0);
+         SAVE_ACC (&acc2, 4);
+         SAVE_ACC (&acc1, 2);
+         SAVE_ACC (&acc3, 6);
+         SAVE_ACC (&acc4, 8);
+         SAVE_ACC (&acc6, 12);
+         SAVE_ACC (&acc5, 10);
+         SAVE_ACC (&acc7, 14);
+         AO += temp << 4;
+         BO += temp << 2;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (16, 4)
+#endif
+         CO += 16;
+       }
+      i = (m & 15) >> 3;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (8, 4);
+#else
+          BO = B;
+          temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1, acc2, acc3;
+         SET_ACC_ZERO4 ();
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[l << 3];
+             __vector_pair rowB;
+             vec_t *rb = (vec_t *) & BO[l << 2];
+             __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+             __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+             __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+             __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+             __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+           }
+         SAVE_ACC (&acc0, 0);
+         SAVE_ACC (&acc2, 4);
+         SAVE_ACC (&acc1, 2);
+         SAVE_ACC (&acc3, 6);
+         CO += 8;
+         AO += temp << 3;
+         BO += temp << 2;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (8, 4)
+#endif
+       }
+      i = (m & 7) >> 2;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (4, 4);
+#else
+          BO = B;
+          temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1;
+         __builtin_mma_xxsetaccz (&acc0);
+         __builtin_mma_xxsetaccz (&acc1);
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[l << 2];
+             __vector_pair rowB;
+             vec_t *rb = (vec_t *) & BO[l << 2];
+             __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+             __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+             __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+           }
+         SAVE_ACC (&acc0, 0);
+         SAVE_ACC (&acc1, 2);
+         CO += 4;
+         AO += temp << 2;
+         BO += temp << 2;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (4, 4)
+#endif
+       }
+      i = (m & 3) >> 1;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (2, 4);
+#else
+          BO = B;
+          temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0;
+         __builtin_mma_xxsetaccz (&acc0);
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[l << 1];
+             __vector_pair rowB;
+             vec_t *rb = (vec_t *) & BO[l << 2];
+             __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+             __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+           }
+         SAVE_ACC (&acc0, 0);
+         CO += 2;
+         AO += temp << 1;
+         BO += temp << 2;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (2, 4)
+#endif
+       }
+      i = (m & 1) >> 0;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (1, 4);
+#else
+          BO = B;
+          temp = k;
+#endif
+         BLASLONG l = 0;
+         v4sf_t t = { 0, 0 };
+         v4sf_t t1 = { 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowA = { AO[l], AO[l] };
+             v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] };
+             v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] };
+             t += rowA * rowB;
+             t1 += rowA * rowB1;
+           }
+         t = t * valpha;
+         t1 = t1 * valpha;
+#if defined(TRMMKERNEL)
+         CO[0 * ldc] = t[0];
+         CO[1 * ldc] = t[1];
+         CO[2 * ldc] = t1[0];
+         CO[3 * ldc] = t1[1];
+#else
+         CO[0 * ldc] += t[0];
+         CO[1 * ldc] += t[1];
+         CO[2 * ldc] += t1[0];
+         CO[3 * ldc] += t1[1];
+#endif
+         CO += 1;
+         AO += temp;
+         BO += temp << 2;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (1, 4)
+#endif
+       }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 4;                 // number of values in A
+#endif
+      B += k << 2;
+    }
+  N = (n & 3) >> 1;
+  for (i1 = 0; i1 < N; i1++)
+    {
+      BLASLONG i, j, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      FLOAT *CO;
+      FLOAT *AO;
+      CO = C;
+      C += ldc << 1;
+      AO = A;
+      i = m >> 4;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (16, 2);
+#else
+          BO = B;
+          temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+         SET_ACC_ZERO8 ();
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             FLOAT t[4] = { 0, 0, 0, 0 };
+             t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+             __vector_pair rowB;
+             vec_t *rb = (vec_t *) & t[0];
+             __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+             vec_t *rowA = (vec_t *) & AO[l << 4];
+             __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+             __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+             __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+             __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+             __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
+             __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
+             __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
+             __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+           }
+         SAVE2x4_ACC (&acc0, 0);
+         SAVE2x4_ACC (&acc1, 2);
+         SAVE2x4_ACC (&acc2, 4);
+         SAVE2x4_ACC (&acc3, 6);
+         SAVE2x4_ACC (&acc4, 8);
+         SAVE2x4_ACC (&acc5, 10);
+         SAVE2x4_ACC (&acc6, 12);
+         SAVE2x4_ACC (&acc7, 14);
+         CO += 16;
+         AO += temp << 4;
+         BO += temp << 1;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (16, 2)
+#endif
+       }
+      i = (m & 15) >> 3;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (8, 2);
+#else
+          BO = B;
+          temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1, acc2, acc3;
+         SET_ACC_ZERO4 ();
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             FLOAT t[4] = { 0, 0, 0, 0 };
+             t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+             __vector_pair rowB;
+             vec_t *rb = (vec_t *) & t[0];
+             __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+             vec_t *rowA = (vec_t *) & AO[l << 3];
+             __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+             __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+             __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+             __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+           }
+         SAVE2x4_ACC (&acc0, 0);
+         SAVE2x4_ACC (&acc1, 2);
+         SAVE2x4_ACC (&acc2, 4);
+         SAVE2x4_ACC (&acc3, 6);
+         CO += 8;
+         AO += temp << 3;
+         BO += temp << 1;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (8, 2)
+#endif
+       }
+      i = (m & 7) >> 2;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (4, 2);
+#else
+          BO = B;
+          temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1;
+         __builtin_mma_xxsetaccz (&acc0);
+         __builtin_mma_xxsetaccz (&acc1);
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             FLOAT t[4] = { 0, 0, 0, 0 };
+             t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+             __vector_pair rowB;
+             vec_t *rb = (vec_t *) & t[0];
+             __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+             vec_t *rowA = (vec_t *) & AO[l << 2];
+             __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+             __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+           }
+         SAVE2x4_ACC (&acc0, 0);
+         SAVE2x4_ACC (&acc1, 2);
+         CO += 4;
+         AO += temp << 2;
+         BO += temp << 1;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (4, 2)
+#endif
+       }
+      i = (m & 3) >> 1;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (2, 2);
+#else
+          BO = B;
+          temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0;
+         __builtin_mma_xxsetaccz (&acc0);
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             FLOAT t[4] = { 0, 0, 0, 0 };
+             t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+             __vector_pair rowB;
+             vec_t *rb = (vec_t *) & t[0];
+             __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+             vec_t *rowA = (vec_t *) & AO[l << 1];
+             __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+           }
+         SAVE2x4_ACC (&acc0, 0);
+         CO += 2;
+         AO += temp << 1;
+         BO += temp << 1;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (2, 2)
+#endif
+       }
+      i = (m & 1) >> 0;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (1, 2);
+#else
+          BO = B;
+          temp = k;
+#endif
+         BLASLONG l = 0;
+         v4sf_t t = { 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowA = { AO[l], AO[l] };
+             v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] };
+             t += rowA * rowB;
+           }
+         t = t * valpha;
+#if defined(TRMMKERNEL)
+         CO[0 * ldc] = t[0];
+         CO[1 * ldc] = t[1];
+#else
+         CO[0 * ldc] += t[0];
+         CO[1 * ldc] += t[1];
+#endif
+         CO += 1;
+         AO += temp;
+         BO += temp << 1;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (1, 2)
+#endif
+       }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 2;                 // number of values in A
+#endif
+      B += k << 1;
+    }
+  N = (n & 1) >> 0;
+  for (i1 = 0; i1 < N; i1++)
+    {
+      BLASLONG i, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      FLOAT *CO;
+      FLOAT *AO;
+      CO = C;
+      C += ldc;
+      AO = A;
+      i = m;
+      while (i >= 16)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (16, 1)
+#else
+          BO = B;
+          temp = k;
+#endif
+         BLASLONG l = 0;
+         v4sf_t t = { 0, 0 };
+         v4sf_t t1 = { 0, 0 };
+         v4sf_t t2 = { 0, 0 };
+         v4sf_t t3 = { 0, 0 };
+         v4sf_t t4 = { 0, 0 };
+         v4sf_t t5 = { 0, 0 };
+         v4sf_t t6 = { 0, 0 };
+         v4sf_t t7 = { 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowB = { BO[l], BO[l] };
+             v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
+             v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
+             v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
+             v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
+             v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
+             v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
+             v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
+             v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
+             t += rowA * rowB;
+             t1 += rowA1 * rowB;
+             t2 += rowA2 * rowB;
+             t3 += rowA3 * rowB;
+             t4 += rowA4 * rowB;
+             t5 += rowA5 * rowB;
+             t6 += rowA6 * rowB;
+             t7 += rowA7 * rowB;
+           }
+         t = t * valpha;
+         t1 = t1 * valpha;
+         t2 = t2 * valpha;
+         t3 = t3 * valpha;
+         t4 = t4 * valpha;
+         t5 = t5 * valpha;
+         t6 = t6 * valpha;
+         t7 = t7 * valpha;
+#if defined(TRMMKERNEL)
+         CO[0] = t[0];
+         CO[1] = t[1];
+         CO[2] = t1[0];
+         CO[3] = t1[1];
+         CO[4] = t2[0];
+         CO[5] = t2[1];
+         CO[6] = t3[0];
+         CO[7] = t3[1];
+         CO[8] = t4[0];
+         CO[9] = t4[1];
+         CO[10] = t5[0];
+         CO[11] = t5[1];
+         CO[12] = t6[0];
+         CO[13] = t6[1];
+         CO[14] = t7[0];
+         CO[15] = t7[1];
+#else
+         CO[0] += t[0];
+         CO[1] += t[1];
+         CO[2] += t1[0];
+         CO[3] += t1[1];
+         CO[4] += t2[0];
+         CO[5] += t2[1];
+         CO[6] += t3[0];
+         CO[7] += t3[1];
+         CO[8] += t4[0];
+         CO[9] += t4[1];
+         CO[10] += t5[0];
+         CO[11] += t5[1];
+         CO[12] += t6[0];
+         CO[13] += t6[1];
+         CO[14] += t7[0];
+         CO[15] += t7[1];
+#endif
+         AO += temp << 4;
+         BO += temp;
+         CO += 16;
+         i -= 16;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (16, 1)
+#endif
+       }
+      while (i >= 8)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (8, 1)
+#else
+          BO = B;
+          temp = k;
+#endif
+         BLASLONG l = 0;
+         v4sf_t t = { 0, 0 };
+         v4sf_t t1 = { 0, 0 };
+         v4sf_t t2 = { 0, 0 };
+         v4sf_t t3 = { 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowB = { BO[l], BO[l] };
+             v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] };
+             v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] };
+             v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] };
+             v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] };
+             t += rowA * rowB;
+             t1 += rowA1 * rowB;
+             t2 += rowA2 * rowB;
+             t3 += rowA3 * rowB;
+           }
+         t = t * valpha;
+         t1 = t1 * valpha;
+         t2 = t2 * valpha;
+         t3 = t3 * valpha;
+#if defined(TRMMKERNEL)
+         CO[0] = t[0];
+         CO[1] = t[1];
+         CO[2] = t1[0];
+         CO[3] = t1[1];
+         CO[4] = t2[0];
+         CO[5] = t2[1];
+         CO[6] = t3[0];
+         CO[7] = t3[1];
+#else
+         CO[0] += t[0];
+         CO[1] += t[1];
+         CO[2] += t1[0];
+         CO[3] += t1[1];
+         CO[4] += t2[0];
+         CO[5] += t2[1];
+         CO[6] += t3[0];
+         CO[7] += t3[1];
+#endif
+         AO += temp << 3;
+         BO += temp;
+         CO += 8;
+         i -= 8;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (8, 1)
+#endif
+       }
+      while (i >= 4)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (4, 1)
+#else
+          BO = B;
+          temp = k;
+#endif
+         BLASLONG l = 0;
+         v4sf_t t = { 0, 0 };
+         v4sf_t t1 = { 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowB = { BO[l], BO[l] };
+             v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] };
+             v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] };
+             t += rowA * rowB;
+             t1 += rowA1 * rowB;
+           }
+         t = t * valpha;
+         t1 = t1 * valpha;
+#if defined(TRMMKERNEL)
+         CO[0] = t[0];
+         CO[1] = t[1];
+         CO[2] = t1[0];
+         CO[3] = t1[1];
+#else
+         CO[0] += t[0];
+         CO[1] += t[1];
+         CO[2] += t1[0];
+         CO[3] += t1[1];
+#endif
+         AO += temp << 2;
+         BO += temp;
+         CO += 4;
+         i -= 4;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (4, 1)
+#endif
+       }
+      while (i >= 2)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (2, 1)
+#else
+          BO = B;
+          temp = k;
+#endif
+         BLASLONG l = 0;
+         v4sf_t t = { 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowB = { BO[l], BO[l] };
+             v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] };
+             t += rowA * rowB;
+           }
+         t = t * valpha;
+#if defined(TRMMKERNEL)
+         CO[0] = t[0];
+         CO[1] = t[1];
+#else
+         CO[0] += t[0];
+         CO[1] += t[1];
+#endif
+         AO += temp << 1;
+         BO += temp;
+         CO += 2;
+         i -= 2;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (2, 1)
+#endif
+       }
+      while (i >= 1)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (1, 1)
+#else
+          BO = B;
+          temp = k;
+#endif
+         BLASLONG l = 0;
+         FLOAT t = 0;
+         for (l = 0; l < temp; l++)
+           {
+             t += AO[l] * BO[l];
+           }
+         AO += temp;
+         BO += temp;
+#if defined(TRMMKERNEL)
+         CO[0] = t * alpha;
+#else
+         CO[0] += t * alpha;
+#endif
+         CO += 1;
+         i -= 1;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (1, 1)
+#endif
+       }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 1;                 // number of values in A
+#endif
+      B += k;
+    }
+  return 0;
+}
diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c

new file mode 100644 (file)

index 0000000..01c122c
--- /dev/null
+++ b/kernel/power/sgemm_kernel_power10.c
@@ -0,0 +1,1334 @@
+/*********************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+/* GCC vector typedefs: vec_t is 16 raw bytes (the operand type handed to the MMA builtins below); v4sf_t and v2sf_t are 16- and 8-byte vectors of FLOAT. */
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
+#if defined(TRMMKERNEL) /* TRMM build: SAVE macros overwrite C with accumulator * alpha; they expect result, rowC, CO, ldc and alpha in scope at the expansion site */
+#define SAVE_ACC(ACC, J)  /* unpack ACC into result[]; C rows 0..3 at column offset J receive result[3], [2], [1], [0] */ \
+         __builtin_mma_disassemble_acc (result, ACC); \
+         rowC = (v4sf_t *) &CO[0* ldc+J]; \
+          rowC[0] = result[3] * alpha; \
+          rowC = (v4sf_t *) &CO[1*ldc+J]; \
+          rowC[0] = result[2] * alpha; \
+          rowC = (v4sf_t *) &CO[2*ldc+J]; \
+          rowC[0] = result[1] * alpha; \
+          rowC = (v4sf_t *) &CO[3*ldc+J]; \
+          rowC[0] = result[0] * alpha;
+#define SAVE_ACC1(ACC, J)  /* same mapping as SAVE_ACC, but targets C rows 4..7 */ \
+         __builtin_mma_disassemble_acc (result, ACC); \
+         rowC = (v4sf_t *) &CO[4* ldc+J]; \
+          rowC[0] = result[3] * alpha; \
+          rowC = (v4sf_t *) &CO[5*ldc+J]; \
+          rowC[0] = result[2] * alpha; \
+          rowC = (v4sf_t *) &CO[6*ldc+J]; \
+          rowC[0] = result[1] * alpha; \
+          rowC = (v4sf_t *) &CO[7*ldc+J]; \
+          rowC[0] = result[0] * alpha;
+#define  SAVE4x2_ACC(ACC, J)  /* two-element (v2sf_t) stores: C rows 0..3 receive result[6], [4], [2], [0] */ \
+         __builtin_mma_disassemble_acc (result, ACC); \
+         rowC = (v2sf_t *) &CO[0* ldc+J]; \
+          rowC[0] = result[6] * alpha; \
+         rowC = (v2sf_t *) &CO[1* ldc+J]; \
+          rowC[0] = result[4] * alpha; \
+         rowC = (v2sf_t *) &CO[2* ldc+J]; \
+          rowC[0] = result[2] * alpha; \
+         rowC = (v2sf_t *) &CO[3* ldc+J]; \
+          rowC[0] = result[0] * alpha;
+#define  SAVE4x2_ACC1(ACC, J)  /* same mapping as SAVE4x2_ACC, but targets C rows 4..7 */ \
+         __builtin_mma_disassemble_acc (result, ACC); \
+         rowC = (v2sf_t *) &CO[4* ldc+J]; \
+          rowC[0] = result[6] * alpha; \
+         rowC = (v2sf_t *) &CO[5* ldc+J]; \
+          rowC[0] = result[4] * alpha; \
+         rowC = (v2sf_t *) &CO[6* ldc+J]; \
+          rowC[0] = result[2] * alpha; \
+         rowC = (v2sf_t *) &CO[7* ldc+J]; \
+          rowC[0] = result[0] * alpha;
+#define  SAVE2x4_ACC(ACC, J)  /* only C rows 0..1 are written, from result[3] and result[2] */ \
+         __builtin_mma_disassemble_acc (result, ACC); \
+         rowC = (v4sf_t *) &CO[0* ldc+J]; \
+          rowC[0] = result[3] * alpha; \
+         rowC = (v4sf_t *) &CO[1* ldc+J]; \
+          rowC[0] = result[2] * alpha;
+#else /* GEMM build: identical result[] to row mapping, but the scaled values are added to C instead of replacing it */
+#define SAVE_ACC(ACC, J)  \
+         __builtin_mma_disassemble_acc (result, ACC); \
+         rowC = (v4sf_t *) &CO[0* ldc+J]; \
+          rowC[0] += result[3] * alpha; \
+          rowC = (v4sf_t *) &CO[1*ldc+J]; \
+          rowC[0] += result[2] * alpha; \
+          rowC = (v4sf_t *) &CO[2*ldc+J]; \
+          rowC[0] += result[1] * alpha; \
+          rowC = (v4sf_t *) &CO[3*ldc+J]; \
+          rowC[0] += result[0] * alpha;
+#define SAVE_ACC1(ACC, J)  \
+         __builtin_mma_disassemble_acc (result, ACC); \
+         rowC = (v4sf_t *) &CO[4* ldc+J]; \
+          rowC[0] += result[3] * alpha; \
+          rowC = (v4sf_t *) &CO[5*ldc+J]; \
+          rowC[0] += result[2] * alpha; \
+          rowC = (v4sf_t *) &CO[6*ldc+J]; \
+          rowC[0] += result[1] * alpha; \
+          rowC = (v4sf_t *) &CO[7*ldc+J]; \
+          rowC[0] += result[0] * alpha;
+#define  SAVE4x2_ACC(ACC, J)  \
+         __builtin_mma_disassemble_acc (result, ACC); \
+         rowC = (v2sf_t *) &CO[0* ldc+J]; \
+          rowC[0] += result[6] * alpha; \
+         rowC = (v2sf_t *) &CO[1* ldc+J]; \
+          rowC[0] += result[4] * alpha; \
+         rowC = (v2sf_t *) &CO[2* ldc+J]; \
+          rowC[0] += result[2] * alpha; \
+         rowC = (v2sf_t *) &CO[3* ldc+J]; \
+          rowC[0] += result[0] * alpha;
+#define  SAVE4x2_ACC1(ACC, J)  \
+         __builtin_mma_disassemble_acc (result, ACC); \
+         rowC = (v2sf_t *) &CO[4* ldc+J]; \
+          rowC[0] += result[6] * alpha; \
+         rowC = (v2sf_t *) &CO[5* ldc+J]; \
+          rowC[0] += result[4] * alpha; \
+         rowC = (v2sf_t *) &CO[6* ldc+J]; \
+          rowC[0] += result[2] * alpha; \
+         rowC = (v2sf_t *) &CO[7* ldc+J]; \
+          rowC[0] += result[0] * alpha;
+#define  SAVE2x4_ACC(ACC, J)  \
+         __builtin_mma_disassemble_acc (result, ACC); \
+         rowC = (v4sf_t *) &CO[0* ldc+J]; \
+          rowC[0] += result[3] * alpha; \
+         rowC = (v4sf_t *) &CO[1* ldc+J]; \
+          rowC[0] += result[2] * alpha;
+#endif /* TRMMKERNEL */
+#define KERNEL(i, j) /* one step over eight accumulators: even accN use rowB[i], odd accN use rowB[i+1]; acc pairs (0,1) (2,3) (4,5) (6,7) use rowA[j] .. rowA[j+3]; needs acc0..acc7, rowA, rowB in scope */ \
+          __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \
+          __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \
+          __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \
+          __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \
+          __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \
+          __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \
+          __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \
+          __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]);
+#define SET_ACC_ZERO4() /* applies xxsetaccz to acc0..acc3, which must be declared at the expansion site */ \
+          __builtin_mma_xxsetaccz (&acc0); \
+          __builtin_mma_xxsetaccz (&acc1); \
+          __builtin_mma_xxsetaccz (&acc2); \
+          __builtin_mma_xxsetaccz (&acc3);
+/* SET_ACC_ZERO8(): applies xxsetaccz to all eight accumulators acc0..acc7, which must be declared at the expansion site. */
+#define SET_ACC_ZERO8() \
+          __builtin_mma_xxsetaccz (&acc0); \
+          __builtin_mma_xxsetaccz (&acc1); \
+          __builtin_mma_xxsetaccz (&acc2); \
+          __builtin_mma_xxsetaccz (&acc3); \
+          __builtin_mma_xxsetaccz (&acc4); \
+          __builtin_mma_xxsetaccz (&acc5); \
+          __builtin_mma_xxsetaccz (&acc6); \
+          __builtin_mma_xxsetaccz (&acc7);
+/* PREFETCH1(x, y): emits a dcbt (data cache block touch) instruction with x and y as register operands; the expansion already ends in ';', so a ';' at the call site yields an extra empty statement. */
+#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+/* REFRESH_TEMP_BK(x, y): sets temp, the trip count of the inner l-loops on the TRMM paths, to k - off, off + x or off + y depending on LEFT/TRANSA. */
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+#define REFRESH_TEMP_BK(x, y) \
+            temp = k - off;
+#elif defined(LEFT)
+#define REFRESH_TEMP_BK(x, y) \
+            temp = off + x;
+#else
+#define REFRESH_TEMP_BK(x, y) \
+            temp = off + y;
+#endif
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /* in these cases BO restarts at B and AO is left untouched */
+#define REFRESH_POINTERS(x, y) \
+         BO = B; \
+          REFRESH_TEMP_BK(x, y)
+#else /* otherwise skip ahead by off: AO advances by off * x and BO starts at B + off * y */
+#define REFRESH_POINTERS(x, y) \
+          AO += off * x; \
+          BO = B + off * y; \
+          REFRESH_TEMP_BK(x, y)
+#endif /* both variants finish by recomputing temp through REFRESH_TEMP_BK */
+/* REFRESH_OFF(x): advances off by x when LEFT is defined; expands to nothing otherwise. */
+#ifdef LEFT
+#define REFRESH_OFF(x) \
+            off += x;
+#else
+#define REFRESH_OFF(x)
+#endif
+/* UPDATE_TEMP(x, y): subtracts x from temp when LEFT is defined, otherwise subtracts y. */
+#ifdef LEFT
+#define UPDATE_TEMP(x, y) \
+            temp -= x;
+#else
+#define UPDATE_TEMP(x, y) \
+            temp -= y;
+#endif
+/* REFRESH_TMP_AFTER_SAVE(x, y): for LEFT+TRANSA and !LEFT+!TRANSA, recomputes temp as k - off, reduces it via UPDATE_TEMP, then advances AO by temp * x and BO by temp * y; expands to nothing in the other cases. */
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_TMP_AFTER_SAVE(x, y) \
+            temp = k - off; \
+            UPDATE_TEMP(x, y) \
+            AO += temp * x; \
+            BO += temp * y;
+#else
+#define REFRESH_TMP_AFTER_SAVE(x, y)
+#endif
+/* REFRESH_AFTER_SAVE(x, y): expands to REFRESH_TMP_AFTER_SAVE(x, y) followed by REFRESH_OFF(x); the kernel invokes it under TRMMKERNEL after each C tile has been written. */
+#define REFRESH_AFTER_SAVE(x,y) \
+        REFRESH_TMP_AFTER_SAVE(x, y) \
+       REFRESH_OFF(x)
+/*************************************************************************************
+* GEMM Kernel
+*************************************************************************************/
+int
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
+       FLOAT * C, BLASLONG ldc
+#ifdef TRMMKERNEL
+       , BLASLONG offset
+#endif
+  )
+{
+  BLASLONG N = n;
+  BLASLONG i1;
+#if defined(TRMMKERNEL)
+  BLASLONG off;
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+  off = -offset;
+#endif
+
+  v4sf_t valpha = { alpha, alpha, alpha, alpha };
+  N = n >> 3;
+  for (i1 = 0; i1 < N; i1++)
+    {
+      BLASLONG i, j, temp;
+      FLOAT *CO;
+      FLOAT *AO;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      CO = C;
+      C += ldc << 3;
+      AO = A;
+      PREFETCH1 (A, 128);
+      PREFETCH1 (A, 256);
+      i = m >> 4;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (16, 8);
+#else
+         BO = B;
+         temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+         SET_ACC_ZERO8 ();
+         BLASLONG l = 0;
+         BLASLONG K = temp / 64;
+         for (l = 0; l < K; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[0];
+             vec_t *rowB = (vec_t *) & BO[0];
+             KERNEL (0, 0);
+             KERNEL (2, 4);
+             KERNEL (4, 8);
+             KERNEL (6, 12);
+             KERNEL (8, 16);
+             KERNEL (10, 20);
+             KERNEL (12, 24);
+             KERNEL (14, 28);
+             KERNEL (16, 32);
+             KERNEL (18, 36);
+             KERNEL (20, 40);
+             KERNEL (22, 44);
+             KERNEL (24, 48);
+             KERNEL (26, 52);
+             KERNEL (28, 56);
+             KERNEL (30, 60);
+             KERNEL (32, 64);
+             KERNEL (34, 68);
+             KERNEL (36, 72);
+             KERNEL (38, 76);
+             KERNEL (40, 80);
+             KERNEL (42, 84);
+             KERNEL (44, 88);
+             KERNEL (46, 92);
+             KERNEL (48, 96);
+             KERNEL (50, 100);
+             KERNEL (52, 104);
+             KERNEL (54, 108);
+             KERNEL (56, 112);
+             KERNEL (58, 116);
+             KERNEL (60, 120);
+             KERNEL (62, 124);
+             KERNEL (64, 128);
+             KERNEL (66, 132);
+             KERNEL (68, 136);
+             KERNEL (70, 140);
+             KERNEL (72, 144);
+             KERNEL (74, 148);
+             KERNEL (76, 152);
+             KERNEL (78, 156);
+             KERNEL (80, 160);
+             KERNEL (82, 164);
+             KERNEL (84, 168);
+             KERNEL (86, 172);
+             KERNEL (88, 176);
+             KERNEL (90, 180);
+             KERNEL (92, 184);
+             KERNEL (94, 188);
+             KERNEL (96, 192);
+             KERNEL (98, 196);
+             KERNEL (100, 200);
+             KERNEL (102, 204);
+             KERNEL (104, 208);
+             KERNEL (106, 212);
+             KERNEL (108, 216);
+             KERNEL (110, 220);
+             KERNEL (112, 224);
+             KERNEL (114, 228);
+             KERNEL (116, 232);
+             KERNEL (118, 236);
+             KERNEL (120, 240);
+             KERNEL (122, 244);
+             KERNEL (124, 248);
+             KERNEL (126, 252);
+             AO += 1024;
+             BO += 512;
+           }
+         if ((temp & 63) >> 5)
+           {
+             vec_t *rowA = (vec_t *) & AO[0];
+             vec_t *rowB = (vec_t *) & BO[0];
+             KERNEL (0, 0);
+             KERNEL (2, 4);
+             KERNEL (4, 8);
+             KERNEL (6, 12);
+             KERNEL (8, 16);
+             KERNEL (10, 20);
+             KERNEL (12, 24);
+             KERNEL (14, 28);
+             KERNEL (16, 32);
+             KERNEL (18, 36);
+             KERNEL (20, 40);
+             KERNEL (22, 44);
+             KERNEL (24, 48);
+             KERNEL (26, 52);
+             KERNEL (28, 56);
+             KERNEL (30, 60);
+             KERNEL (32, 64);
+             KERNEL (34, 68);
+             KERNEL (36, 72);
+             KERNEL (38, 76);
+             KERNEL (40, 80);
+             KERNEL (42, 84);
+             KERNEL (44, 88);
+             KERNEL (46, 92);
+             KERNEL (48, 96);
+             KERNEL (50, 100);
+             KERNEL (52, 104);
+             KERNEL (54, 108);
+             KERNEL (56, 112);
+             KERNEL (58, 116);
+             KERNEL (60, 120);
+             KERNEL (62, 124);
+             AO += 512;
+             BO += 256;
+           }
+         if ((temp & 31) >> 4)
+           {
+             vec_t *rowA = (vec_t *) & AO[0];
+             vec_t *rowB = (vec_t *) & BO[0];
+             KERNEL (0, 0);
+             KERNEL (2, 4);
+             KERNEL (4, 8);
+             KERNEL (6, 12);
+             KERNEL (8, 16);
+             KERNEL (10, 20);
+             KERNEL (12, 24);
+             KERNEL (14, 28);
+             KERNEL (16, 32);
+             KERNEL (18, 36);
+             KERNEL (20, 40);
+             KERNEL (22, 44);
+             KERNEL (24, 48);
+             KERNEL (26, 52);
+             KERNEL (28, 56);
+             KERNEL (30, 60);
+             AO += 256;
+             BO += 128;
+           }
+         if ((temp & 15) >> 3)
+           {
+             vec_t *rowA = (vec_t *) & AO[0];
+             vec_t *rowB = (vec_t *) & BO[0];
+             KERNEL (0, 0);
+             KERNEL (2, 4);
+             KERNEL (4, 8);
+             KERNEL (6, 12);
+             KERNEL (8, 16);
+             KERNEL (10, 20);
+             KERNEL (12, 24);
+             KERNEL (14, 28);
+             AO += 128;
+             BO += 64;
+           }
+         if ((temp & 7) >> 2)
+           {
+             vec_t *rowA = (vec_t *) & AO[0];
+             vec_t *rowB = (vec_t *) & BO[0];
+             KERNEL (0, 0);
+             KERNEL (2, 4);
+             KERNEL (4, 8);
+             KERNEL (6, 12);
+             AO += 64;
+             BO += 32;
+           }
+         if ((temp & 3) >> 1)
+           {
+             vec_t *rowA = (vec_t *) & AO[0];
+             vec_t *rowB = (vec_t *) & BO[0];
+             KERNEL (0, 0);
+             KERNEL (2, 4);
+             AO += 32;
+             BO += 16;
+           }
+         if ((temp & 1) >> 0)
+           {
+             vec_t *rowA = (vec_t *) & AO[0];
+             vec_t *rowB = (vec_t *) & BO[0];
+             KERNEL (0, 0);
+             AO += 16;
+             BO += 8;
+           }
+         SAVE_ACC (&acc0, 0);
+         SAVE_ACC (&acc2, 4);
+         SAVE_ACC1 (&acc1, 0);
+         SAVE_ACC1 (&acc3, 4);
+         SAVE_ACC (&acc4, 8);
+         SAVE_ACC (&acc6, 12);
+         SAVE_ACC1 (&acc5, 8);
+         SAVE_ACC1 (&acc7, 12);
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (16, 8)
+#endif
+           CO += 16;
+       }
+      i = (m & 15) >> 3;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (8, 8);
+#else
+         BO = B;
+         temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1, acc2, acc3;
+         SET_ACC_ZERO4 ();
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[l << 3];
+             vec_t *rowB = (vec_t *) & BO[l << 3];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+             __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
+             __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]);
+             __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]);
+           }
+         SAVE_ACC (&acc0, 0);
+         SAVE_ACC (&acc2, 4);
+         SAVE_ACC1 (&acc1, 0);
+         SAVE_ACC1 (&acc3, 4);
+         AO += (temp << 3);
+         BO += (temp << 3);
+         CO += 8;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (8, 8)
+#endif
+       }
+      i = (m & 7) >> 2;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (4, 8);
+#else
+         BO = B;
+         temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1;
+         __builtin_mma_xxsetaccz (&acc0);
+         __builtin_mma_xxsetaccz (&acc1);
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[l << 2];
+             vec_t *rowB = (vec_t *) & BO[l << 3];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+             __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
+           }
+         SAVE_ACC (&acc0, 0);
+         SAVE_ACC1 (&acc1, 0);
+         CO += 4;
+         AO += (temp << 2);
+         BO += (temp << 3);
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (4, 8)
+#endif
+       }
+      i = (m & 3) >> 1;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (2, 8);
+#else
+         BO = B;
+         temp = k;
+#endif
+
+         v2sf_t *rowC;
+         v2sf_t result[8];
+         __vector_quad acc0, acc1;
+         __builtin_mma_xxsetaccz (&acc0);
+         __builtin_mma_xxsetaccz (&acc1);
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             FLOAT t[4] = { 0 };
+             t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
+             vec_t *rowA = (vec_t *) & t[0];
+             vec_t *rowB = (vec_t *) & BO[l << 3];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+             __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
+           }
+         SAVE4x2_ACC (&acc0, 0);
+         SAVE4x2_ACC1 (&acc1, 0);
+         CO += 2;
+         AO += (temp << 1);
+         BO += (temp << 3);
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (2, 8)
+#endif
+       }
+      i = (m & 1) >> 0;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (1, 8);
+#else
+         BO = B;
+         temp = k;
+#endif
+         BLASLONG l = 0;
+         v4sf_t t = { 0, 0, 0, 0 };
+         v4sf_t t1 = { 0, 0, 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
+             v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2],
+               BO[(l << 3) + 3]
+             };
+             v4sf_t rowB1 =
+               { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6],
+               BO[(l << 3) + 7]
+             };
+             t += rowA * rowB;
+             t1 += rowA * rowB1;
+           }
+         t = t * valpha;
+         t1 = t1 * valpha;
+#if defined(TRMMKERNEL)
+         CO[0 * ldc] = t[0];
+         CO[1 * ldc] = t[1];
+         CO[2 * ldc] = t[2];
+         CO[3 * ldc] = t[3];
+         CO[4 * ldc] = t1[0];
+         CO[5 * ldc] = t1[1];
+         CO[6 * ldc] = t1[2];
+         CO[7 * ldc] = t1[3];
+#else
+         CO[0 * ldc] += t[0];
+         CO[1 * ldc] += t[1];
+         CO[2 * ldc] += t[2];
+         CO[3 * ldc] += t[3];
+         CO[4 * ldc] += t1[0];
+         CO[5 * ldc] += t1[1];
+         CO[6 * ldc] += t1[2];
+         CO[7 * ldc] += t1[3];
+#endif
+         CO += 1;
+         AO += temp;
+         BO += (temp << 3);
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (1, 8)
+#endif
+       }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 8;                        // number of values in A
+#endif
+
+      B += k << 3;
+    }
+  N = (n & 7) >> 2;
+  for (i1 = 0; i1 < N; i1++)
+    {
+      BLASLONG i, j, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      FLOAT *CO;
+      FLOAT *AO;
+      CO = C;
+      C += ldc << 2;
+      AO = A;
+#if !defined(TRMMKERNEL)
+      i = m >> 5;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO = B;
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         FLOAT *A1;
+         A1 = AO + (16 * k);
+         __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+         SET_ACC_ZERO8 ();
+         BLASLONG l = 0;
+         for (l = 0; l < k; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[l << 4];
+             vec_t *rowA1 = (vec_t *) & A1[l << 4];
+             vec_t *rowB = (vec_t *) & BO[l << 2];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+             __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+             __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
+             __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
+             __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
+             __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
+             __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
+             __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
+           }
+
+         SAVE_ACC (&acc0, 0);
+         SAVE_ACC (&acc1, 4);
+         CO += 8;
+         SAVE_ACC (&acc2, 0);
+         SAVE_ACC (&acc3, 4);
+         CO += 8;
+         SAVE_ACC (&acc4, 0);
+         SAVE_ACC (&acc5, 4);
+         CO += 8;
+         SAVE_ACC (&acc6, 0);
+         SAVE_ACC (&acc7, 4);
+         CO += 8;
+         AO += k << 5;
+         BO += k << 2;
+       }
+      i = (m & 31) >> 4;
+#else
+      i = m >> 4;
+#endif
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (16, 4);
+#else
+         BO = B;
+         temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1, acc2, acc3;
+         SET_ACC_ZERO4 ();
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[l << 4];
+             vec_t *rowB = (vec_t *) & BO[l << 2];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+             __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+             __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
+             __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
+           }
+
+         SAVE_ACC (&acc0, 0);
+         SAVE_ACC (&acc1, 4);
+         CO += 8;
+         SAVE_ACC (&acc2, 0);
+         SAVE_ACC (&acc3, 4);
+         CO += 8;
+         AO += temp << 4;
+         BO += temp << 2;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (16, 4)
+#endif
+       }
+      i = (m & 15) >> 3;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (8, 4);
+#else
+         BO = B;
+         temp = k;
+#endif
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1;
+         __builtin_mma_xxsetaccz (&acc0);
+         __builtin_mma_xxsetaccz (&acc1);
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[l << 3];
+             vec_t *rowB = (vec_t *) & BO[l << 2];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+             __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+           }
+         SAVE_ACC (&acc0, 0);
+         SAVE_ACC (&acc1, 4);
+         CO += 8;
+         AO += temp << 3;
+         BO += temp << 2;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (8, 4)
+#endif
+       }
+      i = (m & 7) >> 2;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (4, 4);
+#else
+         BO = B;
+         temp = k;
+#endif
+         v4sf_t *rowC;
+         __vector_quad acc0;
+         v4sf_t result[4];
+         __builtin_mma_xxsetaccz (&acc0);
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             vec_t *rowA = (vec_t *) & AO[l << 2];
+             vec_t *rowB = (vec_t *) & BO[l << 2];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+           }
+         SAVE_ACC (&acc0, 0);
+         CO += 4;
+         AO += temp << 2;
+         BO += temp << 2;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (4, 4)
+#endif
+       }
+      i = (m & 3) >> 1;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (2, 4);
+#else
+         BO = B;
+         temp = k;
+#endif
+         v2sf_t *rowC;
+         v2sf_t result[8];
+         __vector_quad acc0;
+         __builtin_mma_xxsetaccz (&acc0);
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             FLOAT t[4] = { 0 };
+             t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
+             vec_t *rowA = (vec_t *) & t[0];
+             vec_t *rowB = (vec_t *) & BO[l << 2];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+           }
+         SAVE4x2_ACC (&acc0, 0);
+         CO += 2;
+         AO += temp << 1;
+         BO += temp << 2;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (2, 4)
+#endif
+       }
+      i = (m & 1) >> 0;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (1, 4)
+#else
+         BO = B;
+         temp = k;
+#endif
+         BLASLONG l = 0;
+         v4sf_t t = { 0, 0, 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
+             v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2],
+               BO[(l << 2) + 3]
+             };
+             t += rowA * rowB;
+           }
+         t = t * valpha;
+#if defined(TRMMKERNEL)
+         CO[0 * ldc] = t[0];
+         CO[1 * ldc] = t[1];
+         CO[2 * ldc] = t[2];
+         CO[3 * ldc] = t[3];
+#else
+         CO[0 * ldc] += t[0];
+         CO[1 * ldc] += t[1];
+         CO[2 * ldc] += t[2];
+         CO[3 * ldc] += t[3];
+#endif
+         CO += 1;
+         AO += temp;
+         BO += temp << 2;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (1, 4)
+#endif
+       }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 4;                        // number of values in A
+#endif
+
+      B += k << 2;
+    }
+  N = (n & 3) >> 1;
+  for (i1 = 0; i1 < N; i1++)
+    {
+      BLASLONG i, j, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      FLOAT *CO;
+      FLOAT *AO;
+      CO = C;
+      C += ldc << 1;
+      AO = A;
+#if !defined(TRMMKERNEL)
+      i = m >> 5;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO = B;
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         FLOAT *A1;
+         A1 = AO + (16 * k);
+         __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+         SET_ACC_ZERO8 ();
+         BLASLONG l = 0;
+         for (l = 0; l < k; l++)
+           {
+             FLOAT t[4] = { 0 };
+             t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+             vec_t *rowB = (vec_t *) & t[0];
+             vec_t *rowA = (vec_t *) & AO[l << 4];
+             vec_t *rowA1 = (vec_t *) & A1[l << 4];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+             __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+             __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
+             __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
+             __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
+             __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
+             __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
+             __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
+           }
+         SAVE2x4_ACC (&acc0, 0);
+         SAVE2x4_ACC (&acc1, 4);
+         SAVE2x4_ACC (&acc2, 8);
+         SAVE2x4_ACC (&acc3, 12);
+         CO += 16;
+         SAVE2x4_ACC (&acc4, 0);
+         SAVE2x4_ACC (&acc5, 4);
+         SAVE2x4_ACC (&acc6, 8);
+         SAVE2x4_ACC (&acc7, 12);
+         CO += 16;
+         AO += k << 5;
+         BO += k << 1;
+       }
+      i = (m & 31) >> 4;
+#else
+      i = m >> 4;
+#endif
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1, acc2, acc3;
+         SET_ACC_ZERO4 ();
+         BLASLONG l = 0;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (16, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         for (l = 0; l < temp; l++)
+           {
+             FLOAT t[4] = { 0 };
+             t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+             vec_t *rowB = (vec_t *) & t[0];
+             vec_t *rowA = (vec_t *) & AO[l << 4];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+             __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+             __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
+             __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
+           }
+         SAVE2x4_ACC (&acc0, 0);
+         SAVE2x4_ACC (&acc1, 4);
+         SAVE2x4_ACC (&acc2, 8);
+         SAVE2x4_ACC (&acc3, 12);
+         CO += 16;
+         AO += temp << 4;
+         BO += temp << 1;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (16, 2)
+#endif
+       }
+      i = (m & 15) >> 3;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0, acc1;
+         __builtin_mma_xxsetaccz (&acc0);
+         __builtin_mma_xxsetaccz (&acc1);
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (8, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             FLOAT t[4] = { 0 };
+             t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+             vec_t *rowB = (vec_t *) & t[0];
+             vec_t *rowA = (vec_t *) & AO[l << 3];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+             __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
+           }
+         SAVE2x4_ACC (&acc0, 0);
+         SAVE2x4_ACC (&acc1, 4);
+         CO += 8;
+         AO += temp << 3;
+         BO += temp << 1;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (8, 2)
+#endif
+       }
+      i = (m & 7) >> 2;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+         v4sf_t *rowC;
+         v4sf_t result[4];
+         __vector_quad acc0;
+         __builtin_mma_xxsetaccz (&acc0);
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (4, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         BLASLONG l = 0;
+         for (l = 0; l < temp; l++)
+           {
+             FLOAT t[4] = { 0 };
+             t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+             vec_t *rowB = (vec_t *) & t[0];
+             vec_t *rowA = (vec_t *) & AO[l << 2];
+             __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
+           }
+         SAVE2x4_ACC (&acc0, 0);
+         CO += 4;
+         AO += temp << 2;
+         BO += temp << 1;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (4, 2)
+#endif
+       }
+      i = (m & 3) >> 1;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+         BLASLONG l = 0;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (2, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         v4sf_t t = { 0, 0, 0, 0 };
+         for (l = 0; l < (temp << 1); l += 2)
+           {
+             v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] };
+             v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] };
+             t += rowA * rowB;
+           }
+         t = t * valpha;
+#if defined(TRMMKERNEL)
+         CO[0 * ldc] = t[0];
+         CO[1 * ldc] = t[1];
+         CO[0 * ldc + 1] = t[2];
+         CO[1 * ldc + 1] = t[3];
+#else
+         CO[0 * ldc] += t[0];
+         CO[1 * ldc] += t[1];
+         CO[0 * ldc + 1] += t[2];
+         CO[1 * ldc + 1] += t[3];
+#endif
+         CO += 2;
+         AO += temp << 1;
+         BO += temp << 1;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (2, 2)
+#endif
+       }
+      i = (m & 1) >> 0;
+      for (j = 0; j < i; j++)
+       {
+         FLOAT *BO;
+         BLASLONG l = 0;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (1, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         v4sf_t t = { 0, 0, 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowA = { AO[l], AO[l], 0, 0 };
+             v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 };
+             t += rowA * rowB;
+           }
+         t = t * valpha;
+#if defined(TRMMKERNEL)
+         CO[0 * ldc] = t[0];
+         CO[1 * ldc] = t[1];
+#else
+         CO[0 * ldc] += t[0];
+         CO[1 * ldc] += t[1];
+#endif
+         CO += 1;
+         AO += temp;
+         BO += temp << 1;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (1, 2)
+#endif
+       }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 2;                        // number of values in A
+#endif
+
+      B += k << 1;
+    }
+  N = (n & 1) >> 0;
+  for (i1 = 0; i1 < N; i1++)
+    {
+      BLASLONG i, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      FLOAT *CO;
+      FLOAT *AO;
+      CO = C;
+      C += ldc;
+      AO = A;
+      i = m;
+      while (i >= 16)
+       {
+         FLOAT *BO;
+         BLASLONG l = 0;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (16, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+
+         v4sf_t t = { 0, 0, 0, 0 };
+         v4sf_t t1 = { 0, 0, 0, 0 };
+         v4sf_t t2 = { 0, 0, 0, 0 };
+         v4sf_t t3 = { 0, 0, 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
+             v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2],
+               AO[(l << 4) + 3]
+             };
+             v4sf_t rowA1 =
+               { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6],
+               AO[(l << 4) + 7]
+             };
+             v4sf_t rowA2 =
+               { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10],
+               AO[(l << 4) + 11]
+             };
+             v4sf_t rowA3 =
+               { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14],
+               AO[(l << 4) + 15]
+             };
+             t += rowA * rowB;
+             t1 += rowA1 * rowB;
+             t2 += rowA2 * rowB;
+             t3 += rowA3 * rowB;
+           }
+         t = t * valpha;
+         t1 = t1 * valpha;
+         t2 = t2 * valpha;
+         t3 = t3 * valpha;
+#if defined(TRMMKERNEL)
+         CO[0] = t[0];
+         CO[1] = t[1];
+         CO[2] = t[2];
+         CO[3] = t[3];
+         CO[4] = t1[0];
+         CO[5] = t1[1];
+         CO[6] = t1[2];
+         CO[7] = t1[3];
+         CO[8] = t2[0];
+         CO[9] = t2[1];
+         CO[10] = t2[2];
+         CO[11] = t2[3];
+         CO[12] = t3[0];
+         CO[13] = t3[1];
+         CO[14] = t3[2];
+         CO[15] = t3[3];
+#else
+         CO[0] += t[0];
+         CO[1] += t[1];
+         CO[2] += t[2];
+         CO[3] += t[3];
+         CO[4] += t1[0];
+         CO[5] += t1[1];
+         CO[6] += t1[2];
+         CO[7] += t1[3];
+         CO[8] += t2[0];
+         CO[9] += t2[1];
+         CO[10] += t2[2];
+         CO[11] += t2[3];
+         CO[12] += t3[0];
+         CO[13] += t3[1];
+         CO[14] += t3[2];
+         CO[15] += t3[3];
+#endif
+         AO += temp << 4;
+         BO += temp;
+         CO += 16;
+         i -= 16;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (16, 1)
+#endif
+       }
+      while (i >= 8)
+       {
+         FLOAT *BO;
+         BLASLONG l = 0;
+         v4sf_t t = { 0, 0, 0, 0 };
+         v4sf_t t1 = { 0, 0, 0, 0 };
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (8, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
+             v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2],
+               AO[(l << 3) + 3]
+             };
+             v4sf_t rowA1 =
+               { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6],
+               AO[(l << 3) + 7]
+             };
+             t += rowA * rowB;
+             t1 += rowA1 * rowB;
+           }
+         t = t * valpha;
+         t1 = t1 * valpha;
+#if defined(TRMMKERNEL)
+         CO[0] = t[0];
+         CO[1] = t[1];
+         CO[2] = t[2];
+         CO[3] = t[3];
+         CO[4] = t1[0];
+         CO[5] = t1[1];
+         CO[6] = t1[2];
+         CO[7] = t1[3];
+#else
+         CO[0] += t[0];
+         CO[1] += t[1];
+         CO[2] += t[2];
+         CO[3] += t[3];
+         CO[4] += t1[0];
+         CO[5] += t1[1];
+         CO[6] += t1[2];
+         CO[7] += t1[3];
+#endif
+         AO += temp << 3;
+         BO += temp;
+         CO += 8;
+         i -= 8;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (8, 1)
+#endif
+       }
+      while (i >= 4)
+       {
+         FLOAT *BO;
+         BLASLONG l = 0;
+         v4sf_t t = { 0, 0, 0, 0 };
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (4, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
+             v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2],
+               AO[(l << 2) + 3]
+             };
+             t += rowA * rowB;
+           }
+         t = t * valpha;
+#if defined(TRMMKERNEL)
+         CO[0] = t[0];
+         CO[1] = t[1];
+         CO[2] = t[2];
+         CO[3] = t[3];
+#else
+         CO[0] += t[0];
+         CO[1] += t[1];
+         CO[2] += t[2];
+         CO[3] += t[3];
+#endif
+         AO += temp << 2;
+         BO += temp;
+         CO += 4;
+         i -= 4;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (4, 1)
+#endif
+       }
+      while (i >= 2)
+       {
+         FLOAT *BO;
+         BLASLONG l = 0;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (2, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+
+         v4sf_t t = { 0, 0, 0, 0 };
+         for (l = 0; l < temp; l++)
+           {
+             v4sf_t rowB = { BO[l], BO[l], 0, 0 };
+             v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 };
+             t += rowA * rowB;
+           }
+         t = t * valpha;
+#if defined(TRMMKERNEL)
+         CO[0] = t[0];
+         CO[1] = t[1];
+#else
+         CO[0] += t[0];
+         CO[1] += t[1];
+#endif
+         AO += temp << 1;
+         BO += temp;
+         CO += 2;
+         i -= 2;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (2, 1)
+#endif
+       }
+      while (i >= 1)
+       {
+         FLOAT *BO;
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (1, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+
+         BLASLONG l = 0;
+         FLOAT t = 0;
+         for (l = 0; l < temp; l++)
+           {
+             t += AO[l] * BO[l];
+           }
+         AO += temp;
+         BO += temp;
+#if defined(TRMMKERNEL)
+         CO[0] = t * alpha;
+#else
+         CO[0] += t * alpha;
+#endif
+         CO += 1;
+         i -= 1;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (1, 1)
+#endif
+       }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 1;                        // number of values in A
+#endif
+      B += k;
+    }
+  return 0;
+}
author	Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
	Wed, 24 Jun 2020 19:48:15 +0000 (14:48 -0500)
committer	Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
	Wed, 24 Jun 2020 19:48:15 +0000 (14:48 -0500)
kernel/power/KERNEL.POWER10		patch \| blob \| history
kernel/power/cgemm_kernel_power10.S	[new file with mode: 0644]	patch \| blob
kernel/power/cgemm_logic_power10.S	[new file with mode: 0644]	patch \| blob
kernel/power/cgemm_macros_power10.S	[new file with mode: 0644]	patch \| blob
kernel/power/dgemm_kernel_power10.c	[new file with mode: 0644]	patch \| blob
kernel/power/sgemm_kernel_power10.c	[new file with mode: 0644]	patch \| blob