Added dgemm, dtrmm, zgemm and ztrmm kernels for POWER8
author    Werner Saar <wernsaar@googlemail.com>
Tue, 1 Mar 2016 06:33:56 +0000 (07:33 +0100)
committer Werner Saar <wernsaar@googlemail.com>
Tue, 1 Mar 2016 06:33:56 +0000 (07:33 +0100)
28 files changed:
common_power.h
cpuid_power.c
getarch.c
kernel/Makefile.L3
kernel/power/KERNEL
kernel/power/KERNEL.POWER8
kernel/power/def_vsx.h [new file with mode: 0644]
kernel/power/dgemm_kernel_16x4_power8.S [new file with mode: 0644]
kernel/power/dgemm_logic_16x4_power8.S [new file with mode: 0644]
kernel/power/dgemm_macros_16x4_power8.S [new file with mode: 0644]
kernel/power/dtrmm_kernel_16x4_power8.S [new file with mode: 0644]
kernel/power/dtrmm_logic_16x4_power8.S [new file with mode: 0644]
kernel/power/gemm_ncopy_4.S
kernel/power/gemm_tcopy_4.S
kernel/power/gemv_n.S
kernel/power/gemv_t.S
kernel/power/symv_L.S
kernel/power/symv_U.S
kernel/power/zgemm_kernel_8x2_power8.S [new file with mode: 0644]
kernel/power/zgemm_logic_8x2_power8.S [new file with mode: 0644]
kernel/power/zgemm_macros_8x2_power8.S [new file with mode: 0644]
kernel/power/zgemv_n.S
kernel/power/zgemv_t.S
kernel/power/zsymv_L.S
kernel/power/zsymv_U.S
kernel/power/ztrmm_kernel_8x2_power8.S [new file with mode: 0644]
kernel/power/ztrmm_logic_8x2_power8.S [new file with mode: 0644]
param.h

index ab331b0..64e052f 100644
@@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define HAVE_PREFETCH
 #endif
 
-#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL)
+#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
 #define DCBT_ARG       0
 #else
 #define DCBT_ARG       8
@@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define L1_PREFETCH    dcbtst
 #endif
 
+#if defined(POWER8)
+#define L1_DUALFETCH
+#define L1_PREFETCHSIZE (16 + 128 * 100)
+#define L1_PREFETCH    dcbtst
+#endif
+
+#
 #ifndef L1_PREFETCH
 #define L1_PREFETCH    dcbt
 #endif
index 6790076..951204a 100644
@@ -66,7 +66,7 @@ char *cpuname[] = {
   "POWER6",
   "CELL",
   "PPCG4",
-  "POWER8",
+  "POWER8"
 };
 
 char *lowercpuname[] = {
@@ -78,7 +78,7 @@ char *lowercpuname[] = {
   "power6",
   "cell",
   "ppcg4",
-  "power8",
+  "power8"
 };
 
 char *corename[] = {
@@ -90,7 +90,7 @@ char *corename[] = {
   "POWER6",
   "CELL",
   "PPCG4",
-  "POWER8",
+  "POWER8"
 };
 
 int detect(void){
index ff607a4..f9c49e6 100644
--- a/getarch.c
+++ b/getarch.c
@@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "POWER5"
 #endif
 
-#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8)
+#if defined(FORCE_POWER6) || defined(FORCE_POWER7)
 #define FORCE
 #define ARCHITECTURE    "POWER"
 #define SUBARCHITECTURE "POWER6"
@@ -565,7 +565,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "POWER6"
 #endif
 
-#if defined(FORCE_POWER8)
+#if defined(FORCE_POWER8) 
 #define FORCE
 #define ARCHITECTURE    "POWER"
 #define SUBARCHITECTURE "POWER8"
@@ -578,6 +578,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "POWER8"
 #endif
 
+
 #ifdef FORCE_PPCG4
 #define FORCE
 #define ARCHITECTURE    "POWER"
index 63e675b..8e68274 100644
@@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL)
 USE_TRMM = 1
 endif
 
+ifeq ($(CORE), POWER8)
+USE_TRMM = 1
+endif
+
+
 
 
 SKERNELOBJS    += \
index cb9ed84..eae60cd 100644
@@ -1,57 +1,3 @@
-SGEMM_BETA = gemm_beta.S
-DGEMM_BETA = gemm_beta.S
-CGEMM_BETA = zgemm_beta.S
-ZGEMM_BETA = zgemm_beta.S
-
-
-ifndef SSYMV_U_KERNEL
-SSYMV_U_KERNEL =  symv_U.S
-endif
-
-ifndef SSYMV_L_KERNEL
-SSYMV_L_KERNEL =  symv_L.S
-endif
-
-ifndef DSYMV_U_KERNEL
-DSYMV_U_KERNEL =  symv_U.S
-endif
-
-ifndef DSYMV_L_KERNEL
-DSYMV_L_KERNEL =  symv_L.S
-endif
-
-ifndef CSYMV_U_KERNEL
-CSYMV_U_KERNEL =  zsymv_U.S
-endif
-
-ifndef CSYMV_L_KERNEL
-CSYMV_L_KERNEL =  zsymv_L.S
-endif
-
-ifndef ZSYMV_U_KERNEL
-ZSYMV_U_KERNEL =  zsymv_U.S
-endif
-
-ifndef ZSYMV_L_KERNEL
-ZSYMV_L_KERNEL =  zsymv_L.S
-endif
-
-ifndef CHEMV_U_KERNEL
-CHEMV_U_KERNEL =  zsymv_U.S
-endif
-
-ifndef CHEMV_L_KERNEL
-CHEMV_L_KERNEL =  zsymv_L.S
-endif
-
-ifndef ZHEMV_U_KERNEL
-ZHEMV_U_KERNEL =  zsymv_U.S
-endif
-
-ifndef ZHEMV_L_KERNEL
-ZHEMV_L_KERNEL =  zsymv_L.S
-endif
-
 ifndef STRSMKERNEL_LN
 STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 endif
index 344b205..3a627e4 100644
-SGEMMKERNEL    =  gemm_kernel_power6.S
-SGEMMINCOPY    =
-SGEMMITCOPY    =
-SGEMMONCOPY    =  gemm_ncopy_4.S
-SGEMMOTCOPY    =  gemm_tcopy_4.S
-SGEMMINCOPYOBJ =
-SGEMMITCOPYOBJ =
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-DGEMMKERNEL    =  gemm_kernel_power6.S
-DGEMMINCOPY    =
-DGEMMITCOPY    =
+SGEMM_BETA = ../generic/gemm_beta.c
+DGEMM_BETA = ../generic/gemm_beta.c
+CGEMM_BETA = ../generic/zgemm_beta.c
+ZGEMM_BETA = ../generic/zgemm_beta.c
+
+STRMMKERNEL    = ../generic/trmmkernel_2x2.c
+DTRMMKERNEL    = dtrmm_kernel_16x4_power8.S
+CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
+ZTRMMKERNEL    = ztrmm_kernel_8x2_power8.S
+
+SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+SGEMMONCOPYOBJ =  sgemm_oncopy.o
+SGEMMOTCOPYOBJ =  sgemm_otcopy.o
+
+DGEMMKERNEL    =  dgemm_kernel_16x4_power8.S
+DGEMMINCOPY    = ../generic/gemm_ncopy_16.c
+DGEMMITCOPY    = ../generic/gemm_tcopy_16.c
 DGEMMONCOPY    =  gemm_ncopy_4.S
 DGEMMOTCOPY    =  gemm_tcopy_4.S
-DGEMMINCOPYOBJ =
-DGEMMITCOPYOBJ =
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-CGEMMKERNEL    =  zgemm_kernel_power6.S
-CGEMMINCOPY    =  ../generic/zgemm_ncopy_2.c
-CGEMMITCOPY    =  ../generic/zgemm_tcopy_2.c
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_4.c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_4.c
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMKERNEL    =  zgemm_kernel_power6.S
-ZGEMMINCOPY    =  ../generic/zgemm_ncopy_2.c
-ZGEMMITCOPY    =  ../generic/zgemm_tcopy_2.c
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_4.c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_4.c
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRSMKERNEL_LN =  trsm_kernel_power6_LN.S
-STRSMKERNEL_LT =  trsm_kernel_power6_LT.S
-STRSMKERNEL_RN =  trsm_kernel_power6_LT.S
-STRSMKERNEL_RT =  trsm_kernel_power6_RT.S
-
-DTRSMKERNEL_LN =  trsm_kernel_power6_LN.S
-DTRSMKERNEL_LT =  trsm_kernel_power6_LT.S
-DTRSMKERNEL_RN =  trsm_kernel_power6_LT.S
-DTRSMKERNEL_RT =  trsm_kernel_power6_RT.S
-
-CTRSMKERNEL_LN =  ztrsm_kernel_power6_LN.S
-CTRSMKERNEL_LT =  ztrsm_kernel_power6_LT.S
-CTRSMKERNEL_RN =  ztrsm_kernel_power6_LT.S
-CTRSMKERNEL_RT =  ztrsm_kernel_power6_RT.S
-
-ZTRSMKERNEL_LN =  ztrsm_kernel_power6_LN.S
-ZTRSMKERNEL_LT =  ztrsm_kernel_power6_LT.S
-ZTRSMKERNEL_RN =  ztrsm_kernel_power6_LT.S
-ZTRSMKERNEL_RT =  ztrsm_kernel_power6_RT.S
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ =  cgemm_oncopy.o
+CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+
+ZGEMMKERNEL    = zgemm_kernel_8x2_power8.S
+ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+ZGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
+ZGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
+ZGEMMONCOPYOBJ =  zgemm_oncopy.o
+ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
+ZGEMMINCOPYOBJ =  zgemm_incopy.o
+ZGEMMITCOPYOBJ =  zgemm_itcopy.o
+
+STRSMKERNEL_LN =  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT =  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN =  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT =  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+#Todo: CGEMM3MKERNEL should be 4x4 blocksizes.
+CGEMM3MKERNEL    =  zgemm3m_kernel_8x4_sse3.S
+ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_sse3.S
+
+#Pure C for other kernels
+SAMAXKERNEL  = ../arm/amax.c
+DAMAXKERNEL  = ../arm/amax.c
+CAMAXKERNEL  = ../arm/zamax.c
+ZAMAXKERNEL  = ../arm/zamax.c
+
+SAMINKERNEL  = ../arm/amin.c
+DAMINKERNEL  = ../arm/amin.c
+CAMINKERNEL  = ../arm/zamin.c
+ZAMINKERNEL  = ../arm/zamin.c
+
+SMAXKERNEL   = ../arm/max.c
+DMAXKERNEL   = ../arm/max.c
+
+SMINKERNEL   = ../arm/min.c
+DMINKERNEL   = ../arm/min.c
+
+ISAMAXKERNEL = ../arm/iamax.c
+IDAMAXKERNEL = ../arm/iamax.c
+ICAMAXKERNEL = ../arm/izamax.c
+IZAMAXKERNEL = ../arm/izamax.c
+
+ISAMINKERNEL = ../arm/iamin.c
+IDAMINKERNEL = ../arm/iamin.c
+ICAMINKERNEL = ../arm/izamin.c
+IZAMINKERNEL = ../arm/izamin.c
+
+ISMAXKERNEL  = ../arm/imax.c
+IDMAXKERNEL  = ../arm/imax.c
+
+ISMINKERNEL  = ../arm/imin.c
+IDMINKERNEL  = ../arm/imin.c
+
+SASUMKERNEL  = ../arm/asum.c
+DASUMKERNEL  = ../arm/asum.c
+CASUMKERNEL  = ../arm/zasum.c
+ZASUMKERNEL  = ../arm/zasum.c
+
+SAXPYKERNEL  = ../arm/axpy.c
+DAXPYKERNEL  = ../arm/axpy.c
+CAXPYKERNEL  = ../arm/zaxpy.c
+ZAXPYKERNEL  = ../arm/zaxpy.c
+
+SCOPYKERNEL  = ../arm/copy.c
+DCOPYKERNEL  = ../arm/copy.c
+CCOPYKERNEL  = ../arm/zcopy.c
+ZCOPYKERNEL  = ../arm/zcopy.c
+
+SDOTKERNEL   = ../arm/dot.c
+DDOTKERNEL   = ../arm/dot.c
+CDOTKERNEL   = ../arm/zdot.c
+ZDOTKERNEL   = ../arm/zdot.c
+
+SNRM2KERNEL  = ../arm/nrm2.c
+DNRM2KERNEL  = ../arm/nrm2.c
+CNRM2KERNEL  = ../arm/znrm2.c
+ZNRM2KERNEL  = ../arm/znrm2.c
+
+SROTKERNEL   = ../arm/rot.c
+DROTKERNEL   = ../arm/rot.c
+CROTKERNEL   = ../arm/zrot.c
+ZROTKERNEL   = ../arm/zrot.c
+
+SSCALKERNEL  = ../arm/scal.c
+DSCALKERNEL  = ../arm/scal.c
+CSCALKERNEL  = ../arm/zscal.c
+ZSCALKERNEL  = ../arm/zscal.c
+
+SSWAPKERNEL  = ../arm/swap.c
+DSWAPKERNEL  = ../arm/swap.c
+CSWAPKERNEL  = ../arm/zswap.c
+ZSWAPKERNEL  = ../arm/zswap.c
+
+SGEMVNKERNEL = ../arm/gemv_n.c
+DGEMVNKERNEL = ../arm/gemv_n.c
+CGEMVNKERNEL = ../arm/zgemv_n.c
+ZGEMVNKERNEL = ../arm/zgemv_n.c
+
+SGEMVTKERNEL = ../arm/gemv_t.c
+DGEMVTKERNEL = ../arm/gemv_t.c
+CGEMVTKERNEL = ../arm/zgemv_t.c
+ZGEMVTKERNEL = ../arm/zgemv_t.c
+
+SSYMV_U_KERNEL =  ../generic/symv_k.c
+SSYMV_L_KERNEL =  ../generic/symv_k.c
+DSYMV_U_KERNEL =  ../generic/symv_k.c
+DSYMV_L_KERNEL =  ../generic/symv_k.c
+QSYMV_U_KERNEL =  ../generic/symv_k.c
+QSYMV_L_KERNEL =  ../generic/symv_k.c
+CSYMV_U_KERNEL =  ../generic/zsymv_k.c
+CSYMV_L_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_U_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_L_KERNEL =  ../generic/zsymv_k.c
+XSYMV_U_KERNEL =  ../generic/zsymv_k.c
+XSYMV_L_KERNEL =  ../generic/zsymv_k.c
+
+ZHEMV_U_KERNEL =  ../generic/zhemv_k.c
+ZHEMV_L_KERNEL =  ../generic/zhemv_k.c
+
+LSAME_KERNEL = ../generic/lsame.c
+SCABS_KERNEL   = ../generic/cabs.c
+DCABS_KERNEL   = ../generic/cabs.c
+QCABS_KERNEL   = ../generic/cabs.c
+
+#Dump kernel
+CGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
+ZGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
diff --git a/kernel/power/def_vsx.h b/kernel/power/def_vsx.h
new file mode 100644
index 0000000..c2d29e2
--- /dev/null
@@ -0,0 +1,64 @@
+#define vs0 0
+#define vs1 1
+#define vs2 2
+#define vs3 3
+#define vs4 4
+#define vs5 5
+#define vs6 6
+#define vs7 7
+#define vs8 8
+#define vs9 9
+#define vs10 10
+#define vs11 11
+#define vs12 12
+#define vs13 13
+#define vs14 14
+#define vs15 15
+#define vs16 16
+#define vs17 17
+#define vs18 18
+#define vs19 19
+#define vs20 20
+#define vs21 21
+#define vs22 22
+#define vs23 23
+#define vs24 24
+#define vs25 25
+#define vs26 26
+#define vs27 27
+#define vs28 28
+#define vs29 29
+#define vs30 30
+#define vs31 31
+#define vs32 32
+#define vs33 33
+#define vs34 34
+#define vs35 35
+#define vs36 36
+#define vs37 37
+#define vs38 38
+#define vs39 39
+#define vs40 40
+#define vs41 41
+#define vs42 42
+#define vs43 43
+#define vs44 44
+#define vs45 45
+#define vs46 46
+#define vs47 47
+#define vs48 48
+#define vs49 49
+#define vs50 50
+#define vs51 51
+#define vs52 52
+#define vs53 53
+#define vs54 54
+#define vs55 55
+#define vs56 56
+#define vs57 57
+#define vs58 58
+#define vs59 59
+#define vs60 60
+#define vs61 61
+#define vs62 62
+#define vs63 63
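
Note: the vs0-vs63 names above are plain numeric aliases for the 64 VSX registers, so the assembly kernels can refer to them symbolically. As a minimal sketch of how they combine with the VSX instructions used further down in dgemm_macros_16x4_power8.S (the AO/BO pointer names come from the kernel files; the particular register numbers and offsets here are illustrative):

	lxvd2x		vs0,	0,	AO	/* load two doubles of A into VSX reg 0          */
	lxvdsx		vs24,	0,	BO	/* load one double of B and splat it across vs24 */
	xvmaddadp	vs32,	vs0,	vs24	/* vs32 += vs0 * vs24, two double FMAs per instr */
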
diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S
new file mode 100644
index 0000000..53205ad
--- /dev/null
@@ -0,0 +1,313 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD   lwz
+#else
+#define LOAD   ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_SP   296(SP)
+#define FZERO  304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA_SP   224(SP)
+#define FZERO  232(SP)
+#endif
+
+#define        M       r3
+#define        N       r4
+#define        K       r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A      r6
+#define        B       r7
+#define        C       r8
+#define        LDC     r9
+#define OFFSET r10
+#else
+#define A      r7
+#define        B       r8
+#define        C       r9
+#define        LDC     r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r7
+#define OFFSET r6
+#else
+#define A      r7
+#define        B       r8
+#define        C       r9
+#define        LDC     r10
+#define OFFSET r6
+#endif
+#endif
+
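+/* M, N and K arrive in r3-r5; A, B, C, LDC and (for TRMM builds) OFFSET are
+   mapped to r6-r10 as selected by the ABI variants above.  alpha is passed in
+   f1, spilled to ALPHA_SP and later splat into vs18 (alpha_r) via lxvdsx.    */
+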
+#define alpha_r vs18
+
+#define o0     0
+
+#define o8     r15
+#define o24    r16
+#define ALPHA  r17
+#define L      r18
+#define T1     r19
+#define KK     r20
+#define BB     r21
+#define        I       r22
+#define J      r23
+#define AO     r24
+#define        BO      r25
+#define        CO      r26
+#define o16    r27
+#define        o32     r28
+#define        o48     r29
+
+#define PRE    r30
+#define T2     r31
+
+#include "dgemm_macros_16x4_power8.S"
+
+
+#ifndef NEEDPARAM
+
+       PROLOGUE
+       PROFCODE
+
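+	/* Prologue: reserve STACKSIZE bytes and save the non-volatile FPRs
+	   f14-f31 and GPRs r15-r31 used below for pointers and counters.   */
+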
+       addi    SP, SP, -STACKSIZE
+       li      r0, 0
+
+       stfd    f14,    0(SP)
+       stfd    f15,    8(SP)
+       stfd    f16,   16(SP)
+       stfd    f17,   24(SP)
+
+       stfd    f18,   32(SP)
+       stfd    f19,   40(SP)
+       stfd    f20,   48(SP)
+       stfd    f21,   56(SP)
+
+       stfd    f22,   64(SP)
+       stfd    f23,   72(SP)
+       stfd    f24,   80(SP)
+       stfd    f25,   88(SP)
+
+       stfd    f26,   96(SP)
+       stfd    f27,  104(SP)
+       stfd    f28,  112(SP)
+       stfd    f29,  120(SP)
+
+       stfd    f30,  128(SP)
+       stfd    f31,  136(SP)
+
+#ifdef __64BIT__
+       std     r31,  144(SP)
+       std     r30,  152(SP)
+       std     r29,  160(SP)
+       std     r28,  168(SP)
+       std     r27,  176(SP)
+       std     r26,  184(SP)
+       std     r25,  192(SP)
+       std     r24,  200(SP)
+       std     r23,  208(SP)
+       std     r22,  216(SP)
+       std     r21,  224(SP)
+       std     r20,  232(SP)
+       std     r19,  240(SP)
+       std     r18,  248(SP)
+       std     r17,  256(SP)
+       std     r16,  264(SP)
+       std     r15,  272(SP)
+#else
+       stw     r31,  144(SP)
+       stw     r30,  148(SP)
+       stw     r29,  152(SP)
+       stw     r28,  156(SP)
+       stw     r27,  160(SP)
+       stw     r26,  164(SP)
+       stw     r25,  168(SP)
+       stw     r24,  172(SP)
+       stw     r23,  176(SP)
+       stw     r22,  180(SP)
+       stw     r21,  184(SP)
+       stw     r20,  188(SP)
+       stw     r19,  192(SP)
+       stw     r18,  196(SP)
+       stw     r17,  200(SP)
+       stw     r16,  204(SP)
+       stw     r15,  208(SP)
+#endif
+
+       stfd    f1,  ALPHA_SP
+       stw     r0,  FZERO
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+       lwz     LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+       slwi    LDC, LDC, BASE_SHIFT
+
+#if defined(TRMMKERNEL)
+#if defined(linux) && defined(__64BIT__)
+       ld      OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#else
+       lwz     OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#endif
+
+
+       cmpwi   cr0, M, 0
+       ble     L999_H1
+       cmpwi   cr0, N, 0
+       ble     L999_H1
+       cmpwi   cr0, K, 0
+       ble     L999_H1
+
+#ifdef __64BIT__
+       addi    ALPHA, SP, 296
+#else
+       addi    ALPHA, SP, 224
+#endif
+
+       li      PRE, 256 
+       li      o8 , 8
+       li      o16, 16
+       li      o24, 24
+       li      o32, 32
+       li      o48, 48
+
+       lxvdsx  alpha_r, 0, ALPHA
+
+#include "dgemm_logic_16x4_power8.S"
+
+L999:
+       addi    r3, 0, 0
+
+       lfd     f14,    0(SP)
+       lfd     f15,    8(SP)
+       lfd     f16,   16(SP)
+       lfd     f17,   24(SP)
+
+       lfd     f18,   32(SP)
+       lfd     f19,   40(SP)
+       lfd     f20,   48(SP)
+       lfd     f21,   56(SP)
+
+       lfd     f22,   64(SP)
+       lfd     f23,   72(SP)
+       lfd     f24,   80(SP)
+       lfd     f25,   88(SP)
+
+       lfd     f26,   96(SP)
+       lfd     f27,  104(SP)
+       lfd     f28,  112(SP)
+       lfd     f29,  120(SP)
+
+       lfd     f30,  128(SP)
+       lfd     f31,  136(SP)
+
+#ifdef __64BIT__
+       ld      r31,  144(SP)
+       ld      r30,  152(SP)
+       ld      r29,  160(SP)
+       ld      r28,  168(SP)
+       ld      r27,  176(SP)
+       ld      r26,  184(SP)
+       ld      r25,  192(SP)
+       ld      r24,  200(SP)
+       ld      r23,  208(SP)
+       ld      r22,  216(SP)
+       ld      r21,  224(SP)
+       ld      r20,  232(SP)
+       ld      r19,  240(SP)
+       ld      r18,  248(SP)
+       ld      r17,  256(SP)
+       ld      r16,  264(SP)
+       ld      r15,  272(SP)
+#else
+       lwz     r31,  144(SP)
+       lwz     r30,  148(SP)
+       lwz     r29,  152(SP)
+       lwz     r28,  156(SP)
+       lwz     r27,  160(SP)
+       lwz     r26,  164(SP)
+       lwz     r25,  168(SP)
+       lwz     r24,  172(SP)
+       lwz     r23,  176(SP)
+       lwz     r22,  180(SP)
+       lwz     r21,  184(SP)
+       lwz     r20,  188(SP)
+       lwz     r19,  192(SP)
+       lwz     r18,  196(SP)
+       lwz     r17,  200(SP)
+       lwz     r16,  204(SP)
+       lwz     r15,  208(SP)
+#endif
+
+       addi    SP, SP, STACKSIZE
+
+       blr
+
+       EPILOGUE
+#endif
diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S
new file mode 100644
index 0000000..e19f78b
--- /dev/null
@@ -0,0 +1,1647 @@
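+/* Loop driver for the 16x4 DGEMM kernel: J walks N in blocks of 4, I walks M
+   in blocks of 16 (with 8/4/2/1 tails), and K is unrolled by 8, taking the
+   software-pipelined KERNEL*_1/_2 path when at least two rounds of 8 fit.  */
+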
+       srawi.          J,      N,      2
+       ble             DGEMM_L4_END
+
+DGEMM_L4_BEGIN:
+
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       2
+       add             C,      C,      T1
+       srawi.          I,      M,      4
+       ble             DGEMM_L4x16_END
+
+DGEMM_L4x16_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L4x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L4x16_SUB4
+
+DGEMM_L4x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_I1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L4x16_LOOP_END
+
+       .align 5
+
+DGEMM_L4x16_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L4x16_LOOP
+
+DGEMM_L4x16_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       KERNEL4x16_E2
+
+       b               DGEMM_L4x16_SUB1
+
+DGEMM_L4x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL4x16_SUBI1
+       dcbt            AO,     PRE
+       KERNEL4x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL4x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL4x16_SUB1
+
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+
+       b               DGEMM_L4x16_SUB1
+
+DGEMM_L4x16_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L4x16_SAVE
+       b               DGEMM_L4x16_SUB2
+
+DGEMM_L4x16_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L4x16_SAVE
+
+DGEMM_L4x16_SUB2:
+
+       KERNEL4x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L4x16_SUB2
+
+DGEMM_L4x16_SAVE:
+
+       SAVE4x16
+
+       addic.          I,      I,      -1
+       bgt             DGEMM_L4x16_BEGIN
+
+DGEMM_L4x16_END:
+
+DGEMM_L4x8_BEGIN:
+
+       andi.           T2,     M,      15
+       ble             DGEMM_L4x1_END
+
+       andi.           T1,     M,      8
+       ble             DGEMM_L4x8_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L4x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L4x8_SUB4
+
+DGEMM_L4x8_LOOP_START:
+
+       LOAD4x8_1
+       KERNEL4x8_I1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L4x8_LOOP_END
+
+       .align 5
+
+DGEMM_L4x8_LOOP:
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L4x8_LOOP
+
+DGEMM_L4x8_LOOP_END:
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_E2
+
+       b               DGEMM_L4x8_SUB1
+
+DGEMM_L4x8_SUB4:
+
+       KERNEL4x8_SUBI1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       b               DGEMM_L4x8_SUB1
+
+DGEMM_L4x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L4x8_SAVE
+       b               DGEMM_L4x8_SUB2
+
+DGEMM_L4x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L4x8_SAVE
+
+DGEMM_L4x8_SUB2:
+
+       KERNEL4x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L4x8_SUB2
+
+DGEMM_L4x8_SAVE:
+
+       SAVE4x8
+
+DGEMM_L4x8_END:
+
+DGEMM_L4x4_BEGIN:
+
+
+       andi.           T1,     M,      4
+       ble             DGEMM_L4x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L4x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L4x4_SUB4
+
+DGEMM_L4x4_LOOP_START:
+
+       LOAD4x4_1
+       KERNEL4x4_I1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L4x4_LOOP_END
+
+       .align 5
+
+DGEMM_L4x4_LOOP:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L4x4_LOOP
+
+DGEMM_L4x4_LOOP_END:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_E2
+
+       b               DGEMM_L4x4_SUB1
+
+DGEMM_L4x4_SUB4:
+
+       KERNEL4x4_SUBI1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       b               DGEMM_L4x4_SUB1
+
+DGEMM_L4x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L4x4_SAVE
+       b               DGEMM_L4x4_SUB2
+
+DGEMM_L4x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L4x4_SAVE
+
+DGEMM_L4x4_SUB2:
+
+       KERNEL4x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L4x4_SUB2
+
+DGEMM_L4x4_SAVE:
+
+       SAVE4x4
+
+DGEMM_L4x4_END:
+
+DGEMM_L4x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             DGEMM_L4x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L4x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L4x2_SUB4
+
+DGEMM_L4x2_LOOP_START:
+
+       LOAD4x2_1
+       KERNEL4x2_I1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L4x2_LOOP_END
+
+       .align 5
+
+DGEMM_L4x2_LOOP:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L4x2_LOOP
+
+DGEMM_L4x2_LOOP_END:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_E2
+
+       b               DGEMM_L4x2_SUB1
+
+DGEMM_L4x2_SUB4:
+
+       KERNEL4x2_SUBI1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       b               DGEMM_L4x2_SUB1
+
+DGEMM_L4x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L4x2_SAVE
+       b               DGEMM_L4x2_SUB2
+
+DGEMM_L4x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L4x2_SAVE
+
+DGEMM_L4x2_SUB2:
+
+       KERNEL4x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L4x2_SUB2
+
+DGEMM_L4x2_SAVE:
+
+       SAVE4x2
+
+DGEMM_L4x2_END:
+
+DGEMM_L4x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             DGEMM_L4x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L4x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L4x1_SUB4
+
+DGEMM_L4x1_LOOP_START:
+
+       LOAD4x1_1
+       KERNEL4x1_I1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L4x1_LOOP_END
+
+       .align 5
+
+DGEMM_L4x1_LOOP:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L4x1_LOOP
+
+DGEMM_L4x1_LOOP_END:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_E2
+
+       b               DGEMM_L4x1_SUB1
+
+DGEMM_L4x1_SUB4:
+
+       KERNEL4x1_SUBI1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       b               DGEMM_L4x1_SUB1
+
+DGEMM_L4x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L4x1_SAVE
+       b               DGEMM_L4x1_SUB2
+
+DGEMM_L4x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L4x1_SAVE
+
+DGEMM_L4x1_SUB2:
+
+       KERNEL4x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L4x1_SUB2
+
+DGEMM_L4x1_SAVE:
+
+       SAVE4x1
+
+DGEMM_L4x1_END:
+
+       slwi            T1,     K,      5
+       add             B,      B,      T1
+
+       addic.          J,      J,      -1
+       bgt             DGEMM_L4_BEGIN
+
+       andi.           T2,     N,      3
+       ble             L999
+
+DGEMM_L4_END:
+
+       b               DGEMM_L2_BEGIN
+
+L999_H1:
+
+       b               L999
+
+DGEMM_L2_BEGIN:
+
+       andi.           T1,     N,      2
+       ble             DGEMM_L2_END
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       1
+       add             C,      C,      T1
+       srawi.          I,      M,      4
+       ble             DGEMM_L2x16_END
+
+DGEMM_L2x16_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L2x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L2x16_SUB4
+
+DGEMM_L2x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_I1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L2x16_LOOP_END
+
+       .align 5
+
+DGEMM_L2x16_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L2x16_LOOP
+
+DGEMM_L2x16_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       KERNEL2x16_E2
+
+       b               DGEMM_L2x16_SUB1
+
+DGEMM_L2x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL2x16_SUBI1
+       dcbt            AO,     PRE
+       KERNEL2x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x16_SUB1
+
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+
+       b               DGEMM_L2x16_SUB1
+
+DGEMM_L2x16_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L2x16_SAVE
+       b               DGEMM_L2x16_SUB2
+
+DGEMM_L2x16_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L2x16_SAVE
+
+DGEMM_L2x16_SUB2:
+
+       KERNEL2x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L2x16_SUB2
+
+DGEMM_L2x16_SAVE:
+
+       SAVE2x16
+
+       addic.          I,      I,      -1
+       bgt             DGEMM_L2x16_BEGIN
+
+DGEMM_L2x16_END:
+
+DGEMM_L2x8_BEGIN:
+
+       andi.           T2,     M,      15
+       ble             DGEMM_L2x1_END
+
+       andi.           T1,     M,      8
+       ble             DGEMM_L2x8_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L2x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L2x8_SUB4
+
+DGEMM_L2x8_LOOP_START:
+
+       LOAD2x8_1
+       KERNEL2x8_I1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L2x8_LOOP_END
+
+       .align 5
+
+DGEMM_L2x8_LOOP:
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L2x8_LOOP
+
+DGEMM_L2x8_LOOP_END:
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_E2
+
+       b               DGEMM_L2x8_SUB1
+
+DGEMM_L2x8_SUB4:
+
+       KERNEL2x8_SUBI1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       b               DGEMM_L2x8_SUB1
+
+DGEMM_L2x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L2x8_SAVE
+       b               DGEMM_L2x8_SUB2
+
+DGEMM_L2x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L2x8_SAVE
+
+DGEMM_L2x8_SUB2:
+
+       KERNEL2x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L2x8_SUB2
+
+DGEMM_L2x8_SAVE:
+
+       SAVE2x8
+
+DGEMM_L2x8_END:
+
+DGEMM_L2x4_BEGIN:
+
+
+       andi.           T1,     M,      4
+       ble             DGEMM_L2x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L2x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L2x4_SUB4
+
+DGEMM_L2x4_LOOP_START:
+
+       LOAD2x4_1
+       KERNEL2x4_I1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L2x4_LOOP_END
+
+       .align 5
+
+DGEMM_L2x4_LOOP:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L2x4_LOOP
+
+DGEMM_L2x4_LOOP_END:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_E2
+
+       b               DGEMM_L2x4_SUB1
+
+DGEMM_L2x4_SUB4:
+
+       KERNEL2x4_SUBI1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       b               DGEMM_L2x4_SUB1
+
+DGEMM_L2x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L2x4_SAVE
+       b               DGEMM_L2x4_SUB2
+
+DGEMM_L2x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L2x4_SAVE
+
+DGEMM_L2x4_SUB2:
+
+       KERNEL2x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L2x4_SUB2
+
+DGEMM_L2x4_SAVE:
+
+       SAVE2x4
+
+DGEMM_L2x4_END:
+
+DGEMM_L2x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             DGEMM_L2x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L2x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L2x2_SUB4
+
+DGEMM_L2x2_LOOP_START:
+
+       LOAD2x2_1
+       KERNEL2x2_I1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L2x2_LOOP_END
+
+       .align 5
+
+DGEMM_L2x2_LOOP:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L2x2_LOOP
+
+DGEMM_L2x2_LOOP_END:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_E2
+
+       b               DGEMM_L2x2_SUB1
+
+DGEMM_L2x2_SUB4:
+
+       KERNEL2x2_SUBI1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       b               DGEMM_L2x2_SUB1
+
+DGEMM_L2x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L2x2_SAVE
+       b               DGEMM_L2x2_SUB2
+
+DGEMM_L2x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L2x2_SAVE
+
+DGEMM_L2x2_SUB2:
+
+       KERNEL2x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L2x2_SUB2
+
+DGEMM_L2x2_SAVE:
+
+       SAVE2x2
+
+DGEMM_L2x2_END:
+
+DGEMM_L2x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             DGEMM_L2x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L2x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L2x1_SUB4
+
+DGEMM_L2x1_LOOP_START:
+
+       LOAD2x1_1
+       KERNEL2x1_I1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L2x1_LOOP_END
+
+       .align 5
+
+DGEMM_L2x1_LOOP:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L2x1_LOOP
+
+DGEMM_L2x1_LOOP_END:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_E2
+
+       b               DGEMM_L2x1_SUB1
+
+DGEMM_L2x1_SUB4:
+
+       KERNEL2x1_SUBI1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       b               DGEMM_L2x1_SUB1
+
+DGEMM_L2x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L2x1_SAVE
+       b               DGEMM_L2x1_SUB2
+
+DGEMM_L2x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L2x1_SAVE
+
+DGEMM_L2x1_SUB2:
+
+       KERNEL2x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L2x1_SUB2
+
+DGEMM_L2x1_SAVE:
+
+       SAVE2x1
+
+DGEMM_L2x1_END:
+
+       slwi            T1,     K,      4
+       add             B,      B,      T1
+
+DGEMM_L2_END:
+DGEMM_L1_BEGIN:
+
+       andi.           T1,     N,      1
+       ble             DGEMM_L1_END
+       mr              CO,     C
+       mr              AO,     A
+       srawi.          I,      M,      4
+       ble             DGEMM_L1x16_END
+
+DGEMM_L1x16_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L1x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L1x16_SUB4
+
+DGEMM_L1x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_I1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L1x16_LOOP_END
+
+       .align 5
+
+DGEMM_L1x16_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L1x16_LOOP
+
+DGEMM_L1x16_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       KERNEL1x16_E2
+
+       b               DGEMM_L1x16_SUB1
+
+DGEMM_L1x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL1x16_SUBI1
+       dcbt            AO,     PRE
+       KERNEL1x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x16_SUB1
+
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+
+       b               DGEMM_L1x16_SUB1
+
+DGEMM_L1x16_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L1x16_SAVE
+       b               DGEMM_L1x16_SUB2
+
+DGEMM_L1x16_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L1x16_SAVE
+
+DGEMM_L1x16_SUB2:
+
+       KERNEL1x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L1x16_SUB2
+
+DGEMM_L1x16_SAVE:
+
+       SAVE1x16
+
+       addic.          I,      I,      -1
+       bgt             DGEMM_L1x16_BEGIN
+
+DGEMM_L1x16_END:
+
+DGEMM_L1x8_BEGIN:
+
+       andi.           T2,     M,      15
+       ble             DGEMM_L1x1_END
+
+       andi.           T1,     M,      8
+       ble             DGEMM_L1x8_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L1x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L1x8_SUB4
+
+DGEMM_L1x8_LOOP_START:
+
+       LOAD1x8_1
+       KERNEL1x8_I1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L1x8_LOOP_END
+
+       .align 5
+
+DGEMM_L1x8_LOOP:
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L1x8_LOOP
+
+DGEMM_L1x8_LOOP_END:
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_E2
+
+       b               DGEMM_L1x8_SUB1
+
+DGEMM_L1x8_SUB4:
+
+       KERNEL1x8_SUBI1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       b               DGEMM_L1x8_SUB1
+
+DGEMM_L1x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L1x8_SAVE
+       b               DGEMM_L1x8_SUB2
+
+DGEMM_L1x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L1x8_SAVE
+
+DGEMM_L1x8_SUB2:
+
+       KERNEL1x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L1x8_SUB2
+
+DGEMM_L1x8_SAVE:
+
+       SAVE1x8
+
+DGEMM_L1x8_END:
+
+DGEMM_L1x4_BEGIN:
+
+
+       andi.           T1,     M,      4
+       ble             DGEMM_L1x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L1x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L1x4_SUB4
+
+DGEMM_L1x4_LOOP_START:
+
+       LOAD1x4_1
+       KERNEL1x4_I1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L1x4_LOOP_END
+
+       .align 5
+
+DGEMM_L1x4_LOOP:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L1x4_LOOP
+
+DGEMM_L1x4_LOOP_END:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_E2
+
+       b               DGEMM_L1x4_SUB1
+
+DGEMM_L1x4_SUB4:
+
+       KERNEL1x4_SUBI1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       b               DGEMM_L1x4_SUB1
+
+DGEMM_L1x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L1x4_SAVE
+       b               DGEMM_L1x4_SUB2
+
+DGEMM_L1x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L1x4_SAVE
+
+DGEMM_L1x4_SUB2:
+
+       KERNEL1x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L1x4_SUB2
+
+DGEMM_L1x4_SAVE:
+
+       SAVE1x4
+
+DGEMM_L1x4_END:
+
+DGEMM_L1x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             DGEMM_L1x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L1x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L1x2_SUB4
+
+DGEMM_L1x2_LOOP_START:
+
+       LOAD1x2_1
+       KERNEL1x2_I1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L1x2_LOOP_END
+
+       .align 5
+
+DGEMM_L1x2_LOOP:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L1x2_LOOP
+
+DGEMM_L1x2_LOOP_END:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_E2
+
+       b               DGEMM_L1x2_SUB1
+
+DGEMM_L1x2_SUB4:
+
+       KERNEL1x2_SUBI1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       b               DGEMM_L1x2_SUB1
+
+DGEMM_L1x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L1x2_SAVE
+       b               DGEMM_L1x2_SUB2
+
+DGEMM_L1x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L1x2_SAVE
+
+DGEMM_L1x2_SUB2:
+
+       KERNEL1x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L1x2_SUB2
+
+DGEMM_L1x2_SAVE:
+
+       SAVE1x2
+
+DGEMM_L1x2_END:
+
+DGEMM_L1x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             DGEMM_L1x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             DGEMM_L1x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DGEMM_L1x1_SUB4
+
+DGEMM_L1x1_LOOP_START:
+
+       LOAD1x1_1
+       KERNEL1x1_I1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -2
+       ble             DGEMM_L1x1_LOOP_END
+
+       .align 5
+
+DGEMM_L1x1_LOOP:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L1x1_LOOP
+
+DGEMM_L1x1_LOOP_END:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_E2
+
+       b               DGEMM_L1x1_SUB1
+
+DGEMM_L1x1_SUB4:
+
+       KERNEL1x1_SUBI1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       b               DGEMM_L1x1_SUB1
+
+DGEMM_L1x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DGEMM_L1x1_SAVE
+       b               DGEMM_L1x1_SUB2
+
+DGEMM_L1x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             DGEMM_L1x1_SAVE
+
+DGEMM_L1x1_SUB2:
+
+       KERNEL1x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DGEMM_L1x1_SUB2
+
+DGEMM_L1x1_SAVE:
+
+       SAVE1x1
+
+DGEMM_L1x1_END:
+
+DGEMM_L1_END:
diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S
new file mode 100644
index 0000000..d409098
--- /dev/null
@@ -0,0 +1,3400 @@
+/*********************************************************************
+* Macros for N=4, M=16                                               *
+*********************************************************************/
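+/* Naming scheme: LOAD4x16_1 preloads the first K-iteration, KERNEL4x16_I1
+   starts the accumulators with xvmuldp, KERNEL4x16_1/_2 are the paired
+   unrolled steps, KERNEL4x16_E2 finishes the unrolled loop, the _SUBI1/_SUB1
+   variants handle the K tail, and SAVE4x16 writes the 16x4 tile back to C. */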
+
+.macro LOAD4x16_1
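+
+	/* Preload one K-iteration: 16 doubles of A into vs0-vs7 (AO advances by
+	   128 bytes) and the 4 B values splat into vs24-vs27 (BO advances 32). */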
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_I1
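+
+	/* First unrolled step: start the 32 accumulators vs32-vs63 with xvmuldp
+	   while already fetching the next A/B values into vs8-vs15/vs28-vs31.  */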
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       xvmuldp                 vs44,   vs4,    vs25
+       xvmuldp                 vs45,   vs5,    vs25
+       xvmuldp                 vs46,   vs6,    vs25
+       xvmuldp                 vs47,   vs7,    vs25
+
+       addi            AO, AO, 64
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+       xvmuldp                 vs50,   vs2,    vs26
+       xvmuldp                 vs51,   vs3,    vs26
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+
+       xvmuldp                 vs52,   vs4,    vs26
+       xvmuldp                 vs53,   vs5,    vs26
+       xvmuldp                 vs54,   vs6,    vs26
+       xvmuldp                 vs55,   vs7,    vs26
+
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+       xvmuldp                 vs58,   vs2,    vs27
+       xvmuldp                 vs59,   vs3,    vs27
+
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       xvmuldp                 vs60,   vs4,    vs27
+       xvmuldp                 vs61,   vs5,    vs27
+       xvmuldp                 vs62,   vs6,    vs27
+       xvmuldp                 vs63,   vs7,    vs27
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_1
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       xvmaddadp               vs44,   vs4,    vs25
+       xvmaddadp               vs45,   vs5,    vs25
+       xvmaddadp               vs46,   vs6,    vs25
+       xvmaddadp               vs47,   vs7,    vs25
+
+       addi            AO, AO, 64
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+       xvmaddadp               vs50,   vs2,    vs26
+       xvmaddadp               vs51,   vs3,    vs26
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+
+       xvmaddadp               vs52,   vs4,    vs26
+       xvmaddadp               vs53,   vs5,    vs26
+       xvmaddadp               vs54,   vs6,    vs26
+       xvmaddadp               vs55,   vs7,    vs26
+
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+       xvmaddadp               vs58,   vs2,    vs27
+       xvmaddadp               vs59,   vs3,    vs27
+
+
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       xvmaddadp               vs60,   vs4,    vs27
+       xvmaddadp               vs61,   vs5,    vs27
+       xvmaddadp               vs62,   vs6,    vs27
+       xvmaddadp               vs63,   vs7,    vs27
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_2
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       xvmaddadp               vs44,   vs12,   vs29
+       xvmaddadp               vs45,   vs13,   vs29
+       xvmaddadp               vs46,   vs14,   vs29
+       xvmaddadp               vs47,   vs15,   vs29
+
+       addi            AO, AO, 64
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+       xvmaddadp               vs50,   vs10,   vs30
+       xvmaddadp               vs51,   vs11,   vs30
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+
+       xvmaddadp               vs52,   vs12,   vs30
+       xvmaddadp               vs53,   vs13,   vs30
+       xvmaddadp               vs54,   vs14,   vs30
+       xvmaddadp               vs55,   vs15,   vs30
+
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+       xvmaddadp               vs58,   vs10,   vs31
+       xvmaddadp               vs59,   vs11,   vs31
+
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       xvmaddadp               vs60,   vs12,   vs31
+       xvmaddadp               vs61,   vs13,   vs31
+       xvmaddadp               vs62,   vs14,   vs31
+       xvmaddadp               vs63,   vs15,   vs31
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+       xvmaddadp               vs44,   vs12,   vs29
+       xvmaddadp               vs45,   vs13,   vs29
+       xvmaddadp               vs46,   vs14,   vs29
+       xvmaddadp               vs47,   vs15,   vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+       xvmaddadp               vs50,   vs10,   vs30
+       xvmaddadp               vs51,   vs11,   vs30
+       xvmaddadp               vs52,   vs12,   vs30
+       xvmaddadp               vs53,   vs13,   vs30
+       xvmaddadp               vs54,   vs14,   vs30
+       xvmaddadp               vs55,   vs15,   vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+       xvmaddadp               vs58,   vs10,   vs31
+       xvmaddadp               vs59,   vs11,   vs31
+       xvmaddadp               vs60,   vs12,   vs31
+       xvmaddadp               vs61,   vs13,   vs31
+       xvmaddadp               vs62,   vs14,   vs31
+       xvmaddadp               vs63,   vs15,   vs31
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+       xvmuldp                 vs44,   vs4,    vs25
+       xvmuldp                 vs45,   vs5,    vs25
+       xvmuldp                 vs46,   vs6,    vs25
+       xvmuldp                 vs47,   vs7,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+       xvmuldp                 vs50,   vs2,    vs26
+       xvmuldp                 vs51,   vs3,    vs26
+       xvmuldp                 vs52,   vs4,    vs26
+       xvmuldp                 vs53,   vs5,    vs26
+       xvmuldp                 vs54,   vs6,    vs26
+       xvmuldp                 vs55,   vs7,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+       xvmuldp                 vs58,   vs2,    vs27
+       xvmuldp                 vs59,   vs3,    vs27
+       xvmuldp                 vs60,   vs4,    vs27
+       xvmuldp                 vs61,   vs5,    vs27
+       xvmuldp                 vs62,   vs6,    vs27
+       xvmuldp                 vs63,   vs7,    vs27
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+       xvmaddadp               vs44,   vs4,    vs25
+       xvmaddadp               vs45,   vs5,    vs25
+       xvmaddadp               vs46,   vs6,    vs25
+       xvmaddadp               vs47,   vs7,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+       xvmaddadp               vs50,   vs2,    vs26
+       xvmaddadp               vs51,   vs3,    vs26
+       xvmaddadp               vs52,   vs4,    vs26
+       xvmaddadp               vs53,   vs5,    vs26
+       xvmaddadp               vs54,   vs6,    vs26
+       xvmaddadp               vs55,   vs7,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+       xvmaddadp               vs58,   vs2,    vs27
+       xvmaddadp               vs59,   vs3,    vs27
+       xvmaddadp               vs60,   vs4,    vs27
+       xvmaddadp               vs61,   vs5,    vs27
+       xvmaddadp               vs62,   vs6,    vs27
+       xvmaddadp               vs63,   vs7,    vs27
+
+.endm
+
+.macro SAVE4x16
+
+       mr              T1,     CO
+       addi            T2,     T1,     64
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+
+       lxvd2x          vs4,    0,      T2
+       lxvd2x          vs5,    o16,    T2
+       lxvd2x          vs6,    o32,    T2
+       lxvd2x          vs7,    o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+       xvmaddadp       vs4,    vs36,   alpha_r
+       xvmaddadp       vs5,    vs37,   alpha_r
+       xvmaddadp       vs6,    vs38,   alpha_r
+       xvmaddadp       vs7,    vs39,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+       xvmuldp         vs4,    vs36,   alpha_r
+       xvmuldp         vs5,    vs37,   alpha_r
+       xvmuldp         vs6,    vs38,   alpha_r
+       xvmuldp         vs7,    vs39,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       dcbt            T1, PRE
+
+       stxvd2x         vs4,    0,      T2
+       stxvd2x         vs5,    o16,    T2
+       stxvd2x         vs6,    o32,    T2
+       stxvd2x         vs7,    o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+
+       lxvd2x          vs12,   0,      T2
+       lxvd2x          vs13,   o16,    T2
+       lxvd2x          vs14,   o32,    T2
+       lxvd2x          vs15,   o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+       xvmaddadp       vs10,   vs42,   alpha_r
+       xvmaddadp       vs11,   vs43,   alpha_r
+       xvmaddadp       vs12,   vs44,   alpha_r
+       xvmaddadp       vs13,   vs45,   alpha_r
+       xvmaddadp       vs14,   vs46,   alpha_r
+       xvmaddadp       vs15,   vs47,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+       xvmuldp         vs10,   vs42,   alpha_r
+       xvmuldp         vs11,   vs43,   alpha_r
+       xvmuldp         vs12,   vs44,   alpha_r
+       xvmuldp         vs13,   vs45,   alpha_r
+       xvmuldp         vs14,   vs46,   alpha_r
+       xvmuldp         vs15,   vs47,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       dcbt            T1, PRE
+
+       stxvd2x         vs12,   0,      T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+
+       lxvd2x          vs4,    0,      T2
+       lxvd2x          vs5,    o16,    T2
+       lxvd2x          vs6,    o32,    T2
+       lxvd2x          vs7,    o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs48,   alpha_r
+       xvmaddadp       vs1,    vs49,   alpha_r
+       xvmaddadp       vs2,    vs50,   alpha_r
+       xvmaddadp       vs3,    vs51,   alpha_r
+       xvmaddadp       vs4,    vs52,   alpha_r
+       xvmaddadp       vs5,    vs53,   alpha_r
+       xvmaddadp       vs6,    vs54,   alpha_r
+       xvmaddadp       vs7,    vs55,   alpha_r
+#else
+       xvmuldp         vs0,    vs48,   alpha_r
+       xvmuldp         vs1,    vs49,   alpha_r
+       xvmuldp         vs2,    vs50,   alpha_r
+       xvmuldp         vs3,    vs51,   alpha_r
+       xvmuldp         vs4,    vs52,   alpha_r
+       xvmuldp         vs5,    vs53,   alpha_r
+       xvmuldp         vs6,    vs54,   alpha_r
+       xvmuldp         vs7,    vs55,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       dcbt            T1, PRE
+
+       stxvd2x         vs4,    0,      T2
+       stxvd2x         vs5,    o16,    T2
+       stxvd2x         vs6,    o32,    T2
+       stxvd2x         vs7,    o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+
+       lxvd2x          vs12,   0,      T2
+       lxvd2x          vs13,   o16,    T2
+       lxvd2x          vs14,   o32,    T2
+       lxvd2x          vs15,   o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs56,   alpha_r
+       xvmaddadp       vs9,    vs57,   alpha_r
+       xvmaddadp       vs10,   vs58,   alpha_r
+       xvmaddadp       vs11,   vs59,   alpha_r
+       xvmaddadp       vs12,   vs60,   alpha_r
+       xvmaddadp       vs13,   vs61,   alpha_r
+       xvmaddadp       vs14,   vs62,   alpha_r
+       xvmaddadp       vs15,   vs63,   alpha_r
+#else
+       xvmuldp         vs8,    vs56,   alpha_r
+       xvmuldp         vs9,    vs57,   alpha_r
+       xvmuldp         vs10,   vs58,   alpha_r
+       xvmuldp         vs11,   vs59,   alpha_r
+       xvmuldp         vs12,   vs60,   alpha_r
+       xvmuldp         vs13,   vs61,   alpha_r
+       xvmuldp         vs14,   vs62,   alpha_r
+       xvmuldp         vs15,   vs63,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       dcbt            T1, PRE
+
+       stxvd2x         vs12,   0,      T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       addi            CO,     CO,     128
+
+.endm
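+
+/* SAVE4x16 stores the 4x16 tile: T1 and T2 walk the two 64-byte
+ * halves of each of the four C columns (stride LDC).  In the plain
+ * GEMM build the existing C values are loaded and updated with
+ * xvmaddadp (C += alpha * AB); under TRMMKERNEL the result is written
+ * as alpha * AB with xvmuldp, without reading C first.  The dcbt
+ * hints prefetch C ahead of the stores.                              */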
+
+/*********************************************************************
+* Macros for N=4, M=8                                                *
+*********************************************************************/
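+
+/* Same pattern as the 4x16 tile, halved in M: 8 doubles of A per
+ * k-iteration (vs0-vs3 / vs8-vs11) against the same four splatted B
+ * values, accumulating into vs32-vs35, vs40-vs43, vs48-vs51 and
+ * vs56-vs59.                                                          */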
+
+.macro LOAD4x8_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_I1
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       xvmuldp                 vs50,   vs2,    vs26
+       xvmuldp                 vs51,   vs3,    vs26
+
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+       xvmuldp                 vs58,   vs2,    vs27
+       xvmuldp                 vs59,   vs3,    vs27
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_1
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       xvmaddadp               vs50,   vs2,    vs26
+       xvmaddadp               vs51,   vs3,    vs26
+
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+       xvmaddadp               vs58,   vs2,    vs27
+       xvmaddadp               vs59,   vs3,    vs27
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_2
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       xvmaddadp               vs50,   vs10,   vs30
+       xvmaddadp               vs51,   vs11,   vs30
+
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+       xvmaddadp               vs58,   vs10,   vs31
+       xvmaddadp               vs59,   vs11,   vs31
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+       xvmaddadp               vs50,   vs10,   vs30
+       xvmaddadp               vs51,   vs11,   vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+       xvmaddadp               vs58,   vs10,   vs31
+       xvmaddadp               vs59,   vs11,   vs31
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+       xvmuldp                 vs50,   vs2,    vs26
+       xvmuldp                 vs51,   vs3,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+       xvmuldp                 vs58,   vs2,    vs27
+       xvmuldp                 vs59,   vs3,    vs27
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+       xvmaddadp               vs50,   vs2,    vs26
+       xvmaddadp               vs51,   vs3,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+       xvmaddadp               vs58,   vs2,    vs27
+       xvmaddadp               vs59,   vs3,    vs27
+
+.endm
+
+.macro SAVE4x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+       xvmaddadp       vs10,   vs42,   alpha_r
+       xvmaddadp       vs11,   vs43,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+       xvmuldp         vs10,   vs42,   alpha_r
+       xvmuldp         vs11,   vs43,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs48,   alpha_r
+       xvmaddadp       vs1,    vs49,   alpha_r
+       xvmaddadp       vs2,    vs50,   alpha_r
+       xvmaddadp       vs3,    vs51,   alpha_r
+#else
+       xvmuldp         vs0,    vs48,   alpha_r
+       xvmuldp         vs1,    vs49,   alpha_r
+       xvmuldp         vs2,    vs50,   alpha_r
+       xvmuldp         vs3,    vs51,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs56,   alpha_r
+       xvmaddadp       vs9,    vs57,   alpha_r
+       xvmaddadp       vs10,   vs58,   alpha_r
+       xvmaddadp       vs11,   vs59,   alpha_r
+#else
+       xvmuldp         vs8,    vs56,   alpha_r
+       xvmuldp         vs9,    vs57,   alpha_r
+       xvmuldp         vs10,   vs58,   alpha_r
+       xvmuldp         vs11,   vs59,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       addi            CO,     CO,     64
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=4                                                *
+*********************************************************************/
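+
+/* 4x4 tile: two A vectors per k-iteration (vs0-vs1 / vs8-vs9, four
+ * doubles), accumulating into vs32-vs33, vs40-vs41, vs48-vs49 and
+ * vs56-vs57.                                                          */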
+
+.macro LOAD4x4_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x4_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+
+.endm
+
+.macro KERNEL4x4_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+
+.endm
+
+.macro KERNEL4x4_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+
+.endm
+
+.macro SAVE4x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs48,   alpha_r
+       xvmaddadp       vs1,    vs49,   alpha_r
+#else
+       xvmuldp         vs0,    vs48,   alpha_r
+       xvmuldp         vs1,    vs49,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs56,   alpha_r
+       xvmaddadp       vs9,    vs57,   alpha_r
+#else
+       xvmuldp         vs8,    vs56,   alpha_r
+       xvmuldp         vs9,    vs57,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+
+       addi            CO,     CO,     32
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=2                                                *
+*********************************************************************/
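+
+/* 4x2 tile: one A vector per k-iteration (vs0 / vs8, two doubles),
+ * with a single accumulator per B value (vs32, vs40, vs48, vs56).     */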
+
+.macro LOAD4x2_1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x2_I1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x2_1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x2_2
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+
+.endm
+
+.macro SAVE4x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs48,   alpha_r
+#else
+       xvmuldp         vs0,    vs48,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs56,   alpha_r
+#else
+       xvmuldp         vs8,    vs56,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+
+       addi            CO,     CO,     16
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=1                                                *
+*********************************************************************/
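+
+/* 4x1 tile: falls back to scalar VSX operations (lxsdx,
+ * xsmuldp/xsmaddadp) on one double of A per k-iteration; the
+ * accumulators keep the vs32/vs40/vs48/vs56 numbering.                */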
+
+.macro LOAD4x1_1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+       lxsdx   vs26,   o16,    BO
+       lxsdx   vs27,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x1_I1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+       lxsdx   vs29,   o8,     BO
+       lxsdx   vs30,   o16,    BO
+       lxsdx   vs31,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+       xsmuldp                 vs40,   vs0,    vs25
+
+       xsmuldp                 vs48,   vs0,    vs26
+
+       xsmuldp                 vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x1_1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+       lxsdx   vs29,   o8,     BO
+       lxsdx   vs30,   o16,    BO
+       lxsdx   vs31,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+       xsmaddadp               vs40,   vs0,    vs25
+
+       xsmaddadp               vs48,   vs0,    vs26
+
+       xsmaddadp               vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x1_2
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+       lxsdx   vs26,   o16,    BO
+       lxsdx   vs27,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+       xsmaddadp               vs40,   vs8,    vs29
+
+       xsmaddadp               vs48,   vs8,    vs30
+
+       xsmaddadp               vs56,   vs8,    vs31
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+       xsmaddadp               vs40,   vs8,    vs29
+
+       xsmaddadp               vs48,   vs8,    vs30
+
+       xsmaddadp               vs56,   vs8,    vs31
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+       lxsdx   vs26,   o16,    BO
+       lxsdx   vs27,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+       xsmuldp                 vs40,   vs0,    vs25
+
+       xsmuldp                 vs48,   vs0,    vs26
+
+       xsmuldp                 vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+       lxsdx   vs26,   o16,    BO
+       lxsdx   vs27,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+       xsmaddadp               vs40,   vs0,    vs25
+
+       xsmaddadp               vs48,   vs0,    vs26
+
+       xsmaddadp               vs56,   vs0,    vs27
+
+.endm
+
+.macro SAVE4x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxsdx           vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs0,    vs32,   alpha_r
+#else
+       xsmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxsdx          vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxsdx           vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs8,    vs40,   alpha_r
+#else
+       xsmuldp         vs8,    vs40,   alpha_r
+#endif
+
+       stxsdx          vs8,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxsdx           vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs0,    vs48,   alpha_r
+#else
+       xsmuldp         vs0,    vs48,   alpha_r
+#endif
+
+       stxsdx          vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxsdx           vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs8,    vs56,   alpha_r
+#else
+       xsmuldp         vs8,    vs56,   alpha_r
+#endif
+
+       stxsdx          vs8,    0,      T1
+
+       addi            CO,     CO,     8
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=16                                               *
+*********************************************************************/
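+
+/* N=2 variants: only two B values are splatted per k-iteration
+ * (vs24-vs25 / vs28-vs29), so the 2x16 tile needs just vs32-vs39 and
+ * vs40-vs47 as accumulators; the A handling matches the 4x16 case.    */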
+
+.macro LOAD2x16_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+.endm
+
+.macro KERNEL2x16_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+       xvmuldp                 vs44,   vs4,    vs25
+       xvmuldp                 vs45,   vs5,    vs25
+       xvmuldp                 vs46,   vs6,    vs25
+       xvmuldp                 vs47,   vs7,    vs25
+
+.endm
+
+.macro KERNEL2x16_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+       xvmaddadp               vs44,   vs4,    vs25
+       xvmaddadp               vs45,   vs5,    vs25
+       xvmaddadp               vs46,   vs6,    vs25
+       xvmaddadp               vs47,   vs7,    vs25
+
+.endm
+
+.macro KERNEL2x16_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+       xvmaddadp               vs44,   vs12,   vs29
+       xvmaddadp               vs45,   vs13,   vs29
+       xvmaddadp               vs46,   vs14,   vs29
+       xvmaddadp               vs47,   vs15,   vs29
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+       xvmaddadp               vs44,   vs12,   vs29
+       xvmaddadp               vs45,   vs13,   vs29
+       xvmaddadp               vs46,   vs14,   vs29
+       xvmaddadp               vs47,   vs15,   vs29
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+       xvmuldp                 vs44,   vs4,    vs25
+       xvmuldp                 vs45,   vs5,    vs25
+       xvmuldp                 vs46,   vs6,    vs25
+       xvmuldp                 vs47,   vs7,    vs25
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+       xvmaddadp               vs44,   vs4,    vs25
+       xvmaddadp               vs45,   vs5,    vs25
+       xvmaddadp               vs46,   vs6,    vs25
+       xvmaddadp               vs47,   vs7,    vs25
+
+.endm
+
+.macro SAVE2x16
+
+       mr              T1,     CO
+       addi            T2,     T1,     64
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+
+       lxvd2x          vs4,    0,      T2
+       lxvd2x          vs5,    o16,    T2
+       lxvd2x          vs6,    o32,    T2
+       lxvd2x          vs7,    o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+       xvmaddadp       vs4,    vs36,   alpha_r
+       xvmaddadp       vs5,    vs37,   alpha_r
+       xvmaddadp       vs6,    vs38,   alpha_r
+       xvmaddadp       vs7,    vs39,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+       xvmuldp         vs4,    vs36,   alpha_r
+       xvmuldp         vs5,    vs37,   alpha_r
+       xvmuldp         vs6,    vs38,   alpha_r
+       xvmuldp         vs7,    vs39,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       stxvd2x         vs4,    0,      T2
+       stxvd2x         vs5,    o16,    T2
+       stxvd2x         vs6,    o32,    T2
+       stxvd2x         vs7,    o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+
+       lxvd2x          vs12,   0,      T2
+       lxvd2x          vs13,   o16,    T2
+       lxvd2x          vs14,   o32,    T2
+       lxvd2x          vs15,   o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+       xvmaddadp       vs10,   vs42,   alpha_r
+       xvmaddadp       vs11,   vs43,   alpha_r
+       xvmaddadp       vs12,   vs44,   alpha_r
+       xvmaddadp       vs13,   vs45,   alpha_r
+       xvmaddadp       vs14,   vs46,   alpha_r
+       xvmaddadp       vs15,   vs47,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+       xvmuldp         vs10,   vs42,   alpha_r
+       xvmuldp         vs11,   vs43,   alpha_r
+       xvmuldp         vs12,   vs44,   alpha_r
+       xvmuldp         vs13,   vs45,   alpha_r
+       xvmuldp         vs14,   vs46,   alpha_r
+       xvmuldp         vs15,   vs47,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       stxvd2x         vs12,   0,      T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       addi            CO,     CO,     128
+
+.endm
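+
+/* Naming scheme used by the macros in this file:
+ *   LOAD<n>x<m>_1       preloads the first A vectors (lxvd2x) and the
+ *                       splatted B values (lxvdsx)
+ *   KERNEL<n>x<m>_I1    starts the accumulators with xvmuldp while
+ *                       fetching the next operands into a second set
+ *                       of registers (vs8.., vs28..)
+ *   KERNEL<n>x<m>_1/_2  alternate between the two register sets so the
+ *                       loads of one step can overlap the FMAs of the
+ *                       other (software pipelining)
+ *   KERNEL<n>x<m>_E2    drains the pipeline (FMAs only, no loads)
+ *   KERNEL<n>x<m>_SUBI1/_SUB1  standalone steps for the K remainder
+ *   SAVE<n>x<m>         scales by alpha_r; without TRMMKERNEL it adds
+ *                       alpha*AB to the C values already in memory,
+ *                       with TRMMKERNEL it stores alpha*AB directly,
+ *                       then advances CO
+ */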
+
+/*********************************************************************
+* Macros for N=2, M=8                                                *
+*********************************************************************/
+
+.macro LOAD2x8_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+.endm
+
+.macro KERNEL2x8_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+
+.endm
+
+.macro KERNEL2x8_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+
+.endm
+
+.macro KERNEL2x8_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+
+.endm
+
+.macro SAVE2x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+       xvmaddadp       vs10,   vs42,   alpha_r
+       xvmaddadp       vs11,   vs43,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+       xvmuldp         vs10,   vs42,   alpha_r
+       xvmuldp         vs11,   vs43,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       addi            CO,     CO,     64
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=4                                                *
+*********************************************************************/
+
+.macro LOAD2x4_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+.endm
+
+.macro KERNEL2x4_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+
+.endm
+
+.macro KERNEL2x4_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+
+.endm
+
+.macro KERNEL2x4_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+
+.endm
+
+.macro SAVE2x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+
+       addi            CO,     CO,     32
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=2                                                *
+*********************************************************************/
+
+.macro LOAD2x2_1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+.endm
+
+.macro KERNEL2x2_I1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x2_1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x2_2
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+
+.endm
+
+.macro SAVE2x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+
+       addi            CO,     CO,     16
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=1                                                *
+*********************************************************************/
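+/* The M=1 tails use the scalar VSX forms (lxsdx, xsmuldp, xsmaddadp,
+ * stxsdx) and process a single double of A per K step instead of a
+ * full 128-bit vector. */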
+
+.macro LOAD2x1_1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+.endm
+
+.macro KERNEL2x1_I1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+       lxsdx   vs29,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+       xsmuldp                 vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x1_1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+       lxsdx   vs29,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+       xsmaddadp               vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x1_2
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+       xsmaddadp               vs40,   vs8,    vs29
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+       xsmaddadp               vs40,   vs8,    vs29
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+       xsmuldp                 vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+       xsmaddadp               vs40,   vs0,    vs25
+
+.endm
+
+.macro SAVE2x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxsdx           vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs0,    vs32,   alpha_r
+#else
+       xsmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxsdx          vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxsdx           vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs8,    vs40,   alpha_r
+#else
+       xsmuldp         vs8,    vs40,   alpha_r
+#endif
+
+       stxsdx          vs8,    0,      T1
+
+       addi            CO,     CO,     8
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=16                                               *
+*********************************************************************/
+
+.macro LOAD1x16_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+.endm
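+
+/* A 16-wide tile is 128 bytes of A per K step; it is loaded as two
+ * 64-byte groups, advancing AO by 64 in between, since the offset
+ * registers defined for this kernel stop at o48. */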
+
+.macro KERNEL1x16_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+.endm
+
+.macro KERNEL1x16_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+.endm
+
+.macro KERNEL1x16_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+.endm
+
+.macro SAVE1x16
+
+       mr              T1,     CO
+       addi            T2,     T1,     64
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+
+       lxvd2x          vs4,    0,      T2
+       lxvd2x          vs5,    o16,    T2
+       lxvd2x          vs6,    o32,    T2
+       lxvd2x          vs7,    o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+       xvmaddadp       vs4,    vs36,   alpha_r
+       xvmaddadp       vs5,    vs37,   alpha_r
+       xvmaddadp       vs6,    vs38,   alpha_r
+       xvmaddadp       vs7,    vs39,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+       xvmuldp         vs4,    vs36,   alpha_r
+       xvmuldp         vs5,    vs37,   alpha_r
+       xvmuldp         vs6,    vs38,   alpha_r
+       xvmuldp         vs7,    vs39,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       stxvd2x         vs4,    0,      T2
+       stxvd2x         vs5,    o16,    T2
+       stxvd2x         vs6,    o32,    T2
+       stxvd2x         vs7,    o48,    T2
+
+       addi            CO,     CO,     128
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=8                                                *
+*********************************************************************/
+
+.macro LOAD1x8_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+.endm
+
+.macro KERNEL1x8_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+.endm
+
+.macro KERNEL1x8_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+.endm
+
+.macro SAVE1x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       addi            CO,     CO,     64
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=4                                                *
+*********************************************************************/
+
+.macro LOAD1x4_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+.endm
+
+.macro KERNEL1x4_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+.endm
+
+.macro KERNEL1x4_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+.endm
+
+.macro SAVE1x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+
+       addi            CO,     CO,     32
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=2                                                *
+*********************************************************************/
+
+.macro LOAD1x2_1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+.endm
+
+.macro KERNEL1x2_I1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x2_1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x2_2
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+.endm
+
+.macro SAVE1x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+
+       addi            CO,     CO,     16
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=1                                                *
+*********************************************************************/
+
+.macro LOAD1x1_1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+.endm
+
+.macro KERNEL1x1_I1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x1_1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x1_2
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+.endm
+
+.macro SAVE1x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxsdx           vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs0,    vs32,   alpha_r
+#else
+       xsmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxsdx          vs0,    0,      T1
+
+       addi            CO,     CO,     8
+
+.endm
+
diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S
new file mode 100644 (file)
index 0000000..c892c65
--- /dev/null
@@ -0,0 +1,327 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD   lwz
+#else
+#define LOAD   ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_SP   296(SP)
+#define FZERO  304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA_SP   224(SP)
+#define FZERO  232(SP)
+#endif
+
+#define        M       r3
+#define        N       r4
+#define        K       r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A      r6
+#define        B       r7
+#define        C       r8
+#define        LDC     r9
+#define OFFSET r10
+#else
+#define A      r7
+#define        B       r8
+#define        C       r9
+#define        LDC     r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r7
+#define OFFSET r6
+#else
+#define A      r7
+#define        B       r8
+#define        C       r9
+#define        LDC     r10
+#define OFFSET r6
+#endif
+#endif
+
+#define alpha_r vs18
+
+#define o0     0
+
+#define K1     r13
+#define KKK    r14
+#define o8     r15
+#define o24    r16
+#define ALPHA  r17
+#define L      r18
+#define T1     r19
+#define KK     r20
+#define BB     r21
+#define        I       r22
+#define J      r23
+#define AO     r24
+#define        BO      r25
+#define        CO      r26
+#define o16    r27
+#define        o32     r28
+#define        o48     r29
+
+#define PRE    r30
+#define T2     r31
+
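+/* The TRMM kernel reuses the DGEMM 16x4 compute macros below; the
+ * TRMMKERNEL-only paths in the SAVE macros (store alpha*AB without
+ * reading C) together with the offset bookkeeping in
+ * dtrmm_logic_16x4_power8.S provide the triangular variant.
+ * TRMMKERNEL itself is expected to be defined by the build when this
+ * file is compiled. */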
+#include "dgemm_macros_16x4_power8.S"
+
+
+#ifndef NEEDPARAM
+
+       PROLOGUE
+       PROFCODE
+
+       addi    SP, SP, -STACKSIZE
+       li      r0, 0
+
+       stfd    f14,    0(SP)
+       stfd    f15,    8(SP)
+       stfd    f16,   16(SP)
+       stfd    f17,   24(SP)
+
+       stfd    f18,   32(SP)
+       stfd    f19,   40(SP)
+       stfd    f20,   48(SP)
+       stfd    f21,   56(SP)
+
+       stfd    f22,   64(SP)
+       stfd    f23,   72(SP)
+       stfd    f24,   80(SP)
+       stfd    f25,   88(SP)
+
+       stfd    f26,   96(SP)
+       stfd    f27,  104(SP)
+       stfd    f28,  112(SP)
+       stfd    f29,  120(SP)
+
+       stfd    f30,  128(SP)
+       stfd    f31,  136(SP)
+
+#ifdef __64BIT__
+       std     r31,  144(SP)
+       std     r30,  152(SP)
+       std     r29,  160(SP)
+       std     r28,  168(SP)
+       std     r27,  176(SP)
+       std     r26,  184(SP)
+       std     r25,  192(SP)
+       std     r24,  200(SP)
+       std     r23,  208(SP)
+       std     r22,  216(SP)
+       std     r21,  224(SP)
+       std     r20,  232(SP)
+       std     r19,  240(SP)
+       std     r18,  248(SP)
+       std     r17,  256(SP)
+       std     r16,  264(SP)
+       std     r15,  272(SP)
+       std     r14,  280(SP)
+       std     r13,  288(SP)
+#else
+       stw     r31,  144(SP)
+       stw     r30,  148(SP)
+       stw     r29,  152(SP)
+       stw     r28,  156(SP)
+       stw     r27,  160(SP)
+       stw     r26,  164(SP)
+       stw     r25,  168(SP)
+       stw     r24,  172(SP)
+       stw     r23,  176(SP)
+       stw     r22,  180(SP)
+       stw     r21,  184(SP)
+       stw     r20,  188(SP)
+       stw     r19,  192(SP)
+       stw     r18,  196(SP)
+       stw     r17,  200(SP)
+       stw     r16,  204(SP)
+       stw     r15,  208(SP)
+       stw     r14,  212(SP)
+       stw     r13,  216(SP)
+#endif
+
+       stfd    f1,  ALPHA_SP
+       stw     r0,  FZERO
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+       lwz     LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+       slwi    LDC, LDC, BASE_SHIFT
+
+#if defined(TRMMKERNEL)
+#if defined(linux) && defined(__64BIT__)
+       ld      OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#else
+       lwz     OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#endif
+
+       mr      KK, OFFSET
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        neg     KK, KK
+#endif
+
+       cmpwi   cr0, M, 0
+       ble     L999_H1
+       cmpwi   cr0, N, 0
+       ble     L999_H1
+       cmpwi   cr0, K, 0
+       ble     L999_H1
+
+#ifdef __64BIT__
+       addi    ALPHA, SP, 296
+#else
+       addi    ALPHA, SP, 224
+#endif
+
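+       // PRE is the dcbt prefetch distance used in the logic file,
+       // o8..o48 are the byte offsets for the indexed vector loads and
+       // stores, and alpha is reloaded from the stack and splatted into
+       // alpha_r for the SAVE macros.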
+       li      PRE, 256 
+       li      o8 , 8
+       li      o16, 16
+       li      o24, 24
+       li      o32, 32
+       li      o48, 48
+
+       lxvdsx  alpha_r, 0, ALPHA
+
+#include "dtrmm_logic_16x4_power8.S"
+
+L999:
+       addi    r3, 0, 0
+
+       lfd     f14,    0(SP)
+       lfd     f15,    8(SP)
+       lfd     f16,   16(SP)
+       lfd     f17,   24(SP)
+
+       lfd     f18,   32(SP)
+       lfd     f19,   40(SP)
+       lfd     f20,   48(SP)
+       lfd     f21,   56(SP)
+
+       lfd     f22,   64(SP)
+       lfd     f23,   72(SP)
+       lfd     f24,   80(SP)
+       lfd     f25,   88(SP)
+
+       lfd     f26,   96(SP)
+       lfd     f27,  104(SP)
+       lfd     f28,  112(SP)
+       lfd     f29,  120(SP)
+
+       lfd     f30,  128(SP)
+       lfd     f31,  136(SP)
+
+#ifdef __64BIT__
+       ld      r31,  144(SP)
+       ld      r30,  152(SP)
+       ld      r29,  160(SP)
+       ld      r28,  168(SP)
+       ld      r27,  176(SP)
+       ld      r26,  184(SP)
+       ld      r25,  192(SP)
+       ld      r24,  200(SP)
+       ld      r23,  208(SP)
+       ld      r22,  216(SP)
+       ld      r21,  224(SP)
+       ld      r20,  232(SP)
+       ld      r19,  240(SP)
+       ld      r18,  248(SP)
+       ld      r17,  256(SP)
+       ld      r16,  264(SP)
+       ld      r15,  272(SP)
+       ld      r14,  280(SP)
+       ld      r13,  288(SP)
+#else
+       lwz     r31,  144(SP)
+       lwz     r30,  148(SP)
+       lwz     r29,  152(SP)
+       lwz     r28,  156(SP)
+       lwz     r27,  160(SP)
+       lwz     r26,  164(SP)
+       lwz     r25,  168(SP)
+       lwz     r24,  172(SP)
+       lwz     r23,  176(SP)
+       lwz     r22,  180(SP)
+       lwz     r21,  184(SP)
+       lwz     r20,  188(SP)
+       lwz     r19,  192(SP)
+       lwz     r18,  196(SP)
+       lwz     r17,  200(SP)
+       lwz     r16,  204(SP)
+       lwz     r15,  208(SP)
+       lwz     r14,  212(SP)
+       lwz     r13,  216(SP)
+#endif
+
+       addi    SP, SP, STACKSIZE
+
+       blr
+
+       EPILOGUE
+#endif
diff --git a/kernel/power/dtrmm_logic_16x4_power8.S b/kernel/power/dtrmm_logic_16x4_power8.S
new file mode 100644 (file)
index 0000000..f2886f8
--- /dev/null
@@ -0,0 +1,2202 @@
+       srawi.          J,      N,      2
+       ble             DTRMM_L4_END
+
+DTRMM_L4_BEGIN:
+
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       2
+       add             C,      C,      T1
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      4
+       ble             DTRMM_L4x16_END
+
+DTRMM_L4x16_BEGIN:
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     7                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     16                              // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
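+// T1 now holds the effective K count for this tile: either K - KK or
+// KK plus the tile width (16 A values for LEFT, 4 B values otherwise),
+// depending on the LEFT/TRANSA combination.  The same bookkeeping
+// repeats for the narrower M tails below.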
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L4x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L4x16_SUB4
+
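+// L = K1 / 8, so each unrolled pass covers eight K steps.  LOOP_START
+// fills the software pipeline (LOAD4x16_1 + KERNEL4x16_I1) and LOOP_END
+// drains it with KERNEL4x16_E2; together they account for two passes,
+// which is why L is decremented by 2 before entering the main loop.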
+DTRMM_L4x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_I1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L4x16_LOOP_END
+
+       .align 5
+
+DTRMM_L4x16_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L4x16_LOOP
+
+DTRMM_L4x16_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       KERNEL4x16_E2
+
+       b               DTRMM_L4x16_SUB1
+
+DTRMM_L4x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL4x16_SUBI1
+       dcbt            AO,     PRE
+       KERNEL4x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL4x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL4x16_SUB1
+
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+
+       b               DTRMM_L4x16_SUB1
+
+DTRMM_L4x16_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L4x16_SAVE
+       b               DTRMM_L4x16_SUB2
+
+DTRMM_L4x16_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L4x16_SAVE
+
+DTRMM_L4x16_SUB2:
+
+       KERNEL4x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L4x16_SUB2
+
+DTRMM_L4x16_SAVE:
+
+       SAVE4x16
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     7                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     16                              // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             DTRMM_L4x16_BEGIN
+
+DTRMM_L4x16_END:
+
+DTRMM_L4x8_BEGIN:
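+       // (M & 15) == 0 means there are no M tails for this N panel,
+       // so skip the 8/4/2/1 wide blocks.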
+       andi.           T2,     M,      15
+       ble             DTRMM_L4x1_END
+
+       andi.           T1,     M,      8
+       ble             DTRMM_L4x8_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L4x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L4x8_SUB4
+
+DTRMM_L4x8_LOOP_START:
+
+       LOAD4x8_1
+       KERNEL4x8_I1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L4x8_LOOP_END
+
+       .align 5
+
+DTRMM_L4x8_LOOP:
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L4x8_LOOP
+
+DTRMM_L4x8_LOOP_END:
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_E2
+
+       b               DTRMM_L4x8_SUB1
+
+DTRMM_L4x8_SUB4:
+
+       KERNEL4x8_SUBI1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       b               DTRMM_L4x8_SUB1
+
+DTRMM_L4x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L4x8_SAVE
+       b               DTRMM_L4x8_SUB2
+
+DTRMM_L4x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L4x8_SAVE
+
+DTRMM_L4x8_SUB2:
+
+       KERNEL4x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L4x8_SUB2
+
+DTRMM_L4x8_SAVE:
+
+       SAVE4x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L4x8_END:
+
+DTRMM_L4x4_BEGIN:
+
+       andi.           T1,     M,      4
+       ble             DTRMM_L4x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L4x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L4x4_SUB4
+
+DTRMM_L4x4_LOOP_START:
+
+       LOAD4x4_1
+       KERNEL4x4_I1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L4x4_LOOP_END
+
+       .align 5
+
+DTRMM_L4x4_LOOP:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L4x4_LOOP
+
+DTRMM_L4x4_LOOP_END:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_E2
+
+       b               DTRMM_L4x4_SUB1
+
+DTRMM_L4x4_SUB4:
+
+       KERNEL4x4_SUBI1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       b               DTRMM_L4x4_SUB1
+
+DTRMM_L4x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L4x4_SAVE
+       b               DTRMM_L4x4_SUB2
+
+DTRMM_L4x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L4x4_SAVE
+
+DTRMM_L4x4_SUB2:
+
+       KERNEL4x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L4x4_SUB2
+
+DTRMM_L4x4_SAVE:
+
+       SAVE4x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L4x4_END:
+
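+// M & 2 tail of the N=4 column loop: 2 rows of A per k step, so the
+// A offset uses a shift of 4 (2 doubles * 8 bytes) while B keeps 5.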
+DTRMM_L4x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             DTRMM_L4x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L4x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L4x2_SUB4
+
+DTRMM_L4x2_LOOP_START:
+
+       LOAD4x2_1
+       KERNEL4x2_I1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L4x2_LOOP_END
+
+       .align 5
+
+DTRMM_L4x2_LOOP:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L4x2_LOOP
+
+DTRMM_L4x2_LOOP_END:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_E2
+
+       b               DTRMM_L4x2_SUB1
+
+DTRMM_L4x2_SUB4:
+
+       KERNEL4x2_SUBI1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       b               DTRMM_L4x2_SUB1
+
+DTRMM_L4x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L4x2_SAVE
+       b               DTRMM_L4x2_SUB2
+
+DTRMM_L4x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L4x2_SAVE
+
+DTRMM_L4x2_SUB2:
+
+       KERNEL4x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L4x2_SUB2
+
+DTRMM_L4x2_SAVE:
+
+       SAVE4x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L4x2_END:
+
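+// M & 1 tail of the N=4 column loop: a single row of A per k step,
+// so the A offset uses a shift of 3 (1 double * 8 bytes).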
+DTRMM_L4x1_BEGIN:
+
+       andi.           T1,     M,      1
+       ble             DTRMM_L4x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     3                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L4x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L4x1_SUB4
+
+DTRMM_L4x1_LOOP_START:
+
+       LOAD4x1_1
+       KERNEL4x1_I1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L4x1_LOOP_END
+
+       .align 5
+
+DTRMM_L4x1_LOOP:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L4x1_LOOP
+
+DTRMM_L4x1_LOOP_END:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_E2
+
+       b               DTRMM_L4x1_SUB1
+
+DTRMM_L4x1_SUB4:
+
+       KERNEL4x1_SUBI1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       b               DTRMM_L4x1_SUB1
+
+DTRMM_L4x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L4x1_SAVE
+       b               DTRMM_L4x1_SUB2
+
+DTRMM_L4x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L4x1_SAVE
+
+DTRMM_L4x1_SUB2:
+
+       KERNEL4x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L4x1_SUB2
+
+DTRMM_L4x1_SAVE:
+
+       SAVE4x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     3                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L4x1_END:
+
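+// One panel of 4 columns is finished: advance B by K * 4 doubles
+// (K << 5 bytes) to the next panel; when LEFT is not defined, KK
+// moves on by the 4 columns just consumed.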
+       slwi            T1,     K,      5
+       add             B,      B,      T1
+
+#if !defined(LEFT)
+       addi            KK,     KK,     4                                       // KK += Number of values in B
+#endif
+
+
+       addic.          J,      J,      -1
+       bgt             DTRMM_L4_BEGIN
+
+       andi.           T2,     N,      3
+       ble             L999
+
+DTRMM_L4_END:
+
+       b               DTRMM_L2_BEGIN
+
+L999_H1:
+
+       b               L999
+
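+// N & 2 tail: process the remaining 2 columns. C advances by 2 * LDC
+// (LDC is presumably already scaled to bytes in the kernel prologue),
+// KK restarts from OFFSET for the LEFT case, and the M loop again
+// starts with 16-row blocks.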
+DTRMM_L2_BEGIN:
+
+       andi.           T1,     N,      2
+       ble             DTRMM_L2_END
+       mr              CO,     C
+       mr              AO,     A
+	slwi		T1,	LDC,	1					// T1 = 2 * LDC
+       add             C,      C,      T1
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      4
+       ble             DTRMM_L2x16_END
+
+DTRMM_L2x16_BEGIN:
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     7                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     16                              // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L2x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L2x16_SUB4
+
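+// The 2x16 block streams 16 doubles (one 128-byte POWER8 cache line)
+// of A per kernel step, so a dcbt for AO + PRE is issued before each
+// step; PRE holds the prefetch distance set up earlier in the kernel
+// (not shown here).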
+DTRMM_L2x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_I1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L2x16_LOOP_END
+
+       .align 5
+
+DTRMM_L2x16_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L2x16_LOOP
+
+DTRMM_L2x16_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       KERNEL2x16_E2
+
+       b               DTRMM_L2x16_SUB1
+
+DTRMM_L2x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL2x16_SUBI1
+       dcbt            AO,     PRE
+       KERNEL2x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x16_SUB1
+
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+
+       b               DTRMM_L2x16_SUB1
+
+DTRMM_L2x16_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L2x16_SAVE
+       b               DTRMM_L2x16_SUB2
+
+DTRMM_L2x16_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L2x16_SAVE
+
+DTRMM_L2x16_SUB2:
+
+       KERNEL2x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L2x16_SUB2
+
+DTRMM_L2x16_SAVE:
+
+       SAVE2x16
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     7                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     16                              // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             DTRMM_L2x16_BEGIN
+
+DTRMM_L2x16_END:
+
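+// M & 15 tail rows for the 2-column case: skip to the end if there is
+// no remainder, otherwise peel off 8, 4, 2 and finally 1 row.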
+DTRMM_L2x8_BEGIN:
+       andi.           T2,     M,      15
+       ble             DTRMM_L2x1_END
+
+       andi.           T1,     M,      8
+       ble             DTRMM_L2x8_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L2x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L2x8_SUB4
+
+DTRMM_L2x8_LOOP_START:
+
+       LOAD2x8_1
+       KERNEL2x8_I1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L2x8_LOOP_END
+
+       .align 5
+
+DTRMM_L2x8_LOOP:
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L2x8_LOOP
+
+DTRMM_L2x8_LOOP_END:
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_E2
+
+       b               DTRMM_L2x8_SUB1
+
+DTRMM_L2x8_SUB4:
+
+       KERNEL2x8_SUBI1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       b               DTRMM_L2x8_SUB1
+
+DTRMM_L2x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L2x8_SAVE
+       b               DTRMM_L2x8_SUB2
+
+DTRMM_L2x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L2x8_SAVE
+
+DTRMM_L2x8_SUB2:
+
+       KERNEL2x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L2x8_SUB2
+
+DTRMM_L2x8_SAVE:
+
+       SAVE2x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L2x8_END:
+
+DTRMM_L2x4_BEGIN:
+
+       andi.           T1,     M,      4
+       ble             DTRMM_L2x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L2x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L2x4_SUB4
+
+DTRMM_L2x4_LOOP_START:
+
+       LOAD2x4_1
+       KERNEL2x4_I1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L2x4_LOOP_END
+
+       .align 5
+
+DTRMM_L2x4_LOOP:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L2x4_LOOP
+
+DTRMM_L2x4_LOOP_END:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_E2
+
+       b               DTRMM_L2x4_SUB1
+
+DTRMM_L2x4_SUB4:
+
+       KERNEL2x4_SUBI1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       b               DTRMM_L2x4_SUB1
+
+DTRMM_L2x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L2x4_SAVE
+       b               DTRMM_L2x4_SUB2
+
+DTRMM_L2x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L2x4_SAVE
+
+DTRMM_L2x4_SUB2:
+
+       KERNEL2x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L2x4_SUB2
+
+DTRMM_L2x4_SAVE:
+
+       SAVE2x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L2x4_END:
+
+DTRMM_L2x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             DTRMM_L2x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L2x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L2x2_SUB4
+
+DTRMM_L2x2_LOOP_START:
+
+       LOAD2x2_1
+       KERNEL2x2_I1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L2x2_LOOP_END
+
+       .align 5
+
+DTRMM_L2x2_LOOP:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L2x2_LOOP
+
+DTRMM_L2x2_LOOP_END:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_E2
+
+       b               DTRMM_L2x2_SUB1
+
+DTRMM_L2x2_SUB4:
+
+       KERNEL2x2_SUBI1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       b               DTRMM_L2x2_SUB1
+
+DTRMM_L2x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L2x2_SAVE
+       b               DTRMM_L2x2_SUB2
+
+DTRMM_L2x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L2x2_SAVE
+
+DTRMM_L2x2_SUB2:
+
+       KERNEL2x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L2x2_SUB2
+
+DTRMM_L2x2_SAVE:
+
+       SAVE2x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L2x2_END:
+
+DTRMM_L2x1_BEGIN:
+
+       andi.           T1,     M,      1
+       ble             DTRMM_L2x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     3                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L2x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L2x1_SUB4
+
+DTRMM_L2x1_LOOP_START:
+
+       LOAD2x1_1
+       KERNEL2x1_I1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L2x1_LOOP_END
+
+       .align 5
+
+DTRMM_L2x1_LOOP:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L2x1_LOOP
+
+DTRMM_L2x1_LOOP_END:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_E2
+
+       b               DTRMM_L2x1_SUB1
+
+DTRMM_L2x1_SUB4:
+
+       KERNEL2x1_SUBI1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       b               DTRMM_L2x1_SUB1
+
+DTRMM_L2x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L2x1_SAVE
+       b               DTRMM_L2x1_SUB2
+
+DTRMM_L2x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L2x1_SAVE
+
+DTRMM_L2x1_SUB2:
+
+       KERNEL2x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L2x1_SUB2
+
+DTRMM_L2x1_SAVE:
+
+       SAVE2x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     3                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L2x1_END:
+
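+// The 2-column panel is finished: advance B by K * 2 doubles
+// (K << 4 bytes); when LEFT is not defined, KK moves on by 2.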
+       slwi            T1,     K,      4
+       add             B,      B,      T1
+
+#if !defined(LEFT)
+       addi            KK,     KK,     2                                       // KK += Number of values in B
+#endif
+
+
+DTRMM_L2_END:
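+// N & 1 tail: the last single column. C needs no further advance since
+// no column block follows; KK restarts from OFFSET for the LEFT case.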
+DTRMM_L1_BEGIN:
+
+       andi.           T1,     N,      1
+       ble             DTRMM_L1_END
+       mr              CO,     C
+       mr              AO,     A
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      4
+       ble             DTRMM_L1x16_END
+
+DTRMM_L1x16_BEGIN:
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     7                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     16                              // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L1x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L1x16_SUB4
+
+DTRMM_L1x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_I1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L1x16_LOOP_END
+
+       .align 5
+
+DTRMM_L1x16_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L1x16_LOOP
+
+DTRMM_L1x16_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       KERNEL1x16_E2
+
+       b               DTRMM_L1x16_SUB1
+
+DTRMM_L1x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL1x16_SUBI1
+       dcbt            AO,     PRE
+       KERNEL1x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x16_SUB1
+
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+
+       b               DTRMM_L1x16_SUB1
+
+DTRMM_L1x16_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L1x16_SAVE
+       b               DTRMM_L1x16_SUB2
+
+DTRMM_L1x16_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L1x16_SAVE
+
+DTRMM_L1x16_SUB2:
+
+       KERNEL1x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L1x16_SUB2
+
+DTRMM_L1x16_SAVE:
+
+       SAVE1x16
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     7                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     16                              // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             DTRMM_L1x16_BEGIN
+
+DTRMM_L1x16_END:
+
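+// M & 15 tail rows for the single-column case; same peeling order
+// (8, 4, 2, 1 rows) as in the 2-column section above.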
+DTRMM_L1x8_BEGIN:
+       andi.           T2,     M,      15
+       ble             DTRMM_L1x1_END
+
+       andi.           T1,     M,      8
+       ble             DTRMM_L1x8_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L1x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L1x8_SUB4
+
+DTRMM_L1x8_LOOP_START:
+
+       LOAD1x8_1
+       KERNEL1x8_I1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L1x8_LOOP_END
+
+       .align 5
+
+DTRMM_L1x8_LOOP:
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L1x8_LOOP
+
+DTRMM_L1x8_LOOP_END:
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_E2
+
+       b               DTRMM_L1x8_SUB1
+
+DTRMM_L1x8_SUB4:
+
+       KERNEL1x8_SUBI1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       b               DTRMM_L1x8_SUB1
+
+DTRMM_L1x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L1x8_SAVE
+       b               DTRMM_L1x8_SUB2
+
+DTRMM_L1x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L1x8_SAVE
+
+DTRMM_L1x8_SUB2:
+
+       KERNEL1x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L1x8_SUB2
+
+DTRMM_L1x8_SAVE:
+
+       SAVE1x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L1x8_END:
+
+DTRMM_L1x4_BEGIN:
+
+       andi.           T1,     M,      4
+       ble             DTRMM_L1x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L1x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L1x4_SUB4
+
+DTRMM_L1x4_LOOP_START:
+
+       LOAD1x4_1
+       KERNEL1x4_I1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L1x4_LOOP_END
+
+       .align 5
+
+DTRMM_L1x4_LOOP:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L1x4_LOOP
+
+DTRMM_L1x4_LOOP_END:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_E2
+
+       b               DTRMM_L1x4_SUB1
+
+DTRMM_L1x4_SUB4:
+
+       KERNEL1x4_SUBI1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       b               DTRMM_L1x4_SUB1
+
+DTRMM_L1x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L1x4_SAVE
+       b               DTRMM_L1x4_SUB2
+
+DTRMM_L1x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L1x4_SAVE
+
+DTRMM_L1x4_SUB2:
+
+       KERNEL1x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L1x4_SUB2
+
+DTRMM_L1x4_SAVE:
+
+       SAVE1x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L1x4_END:
+
+DTRMM_L1x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             DTRMM_L1x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L1x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L1x2_SUB4
+
+DTRMM_L1x2_LOOP_START:
+
+       LOAD1x2_1
+       KERNEL1x2_I1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L1x2_LOOP_END
+
+       .align 5
+
+DTRMM_L1x2_LOOP:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L1x2_LOOP
+
+DTRMM_L1x2_LOOP_END:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_E2
+
+       b               DTRMM_L1x2_SUB1
+
+DTRMM_L1x2_SUB4:
+
+       KERNEL1x2_SUBI1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       b               DTRMM_L1x2_SUB1
+
+DTRMM_L1x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L1x2_SAVE
+       b               DTRMM_L1x2_SUB2
+
+DTRMM_L1x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L1x2_SAVE
+
+DTRMM_L1x2_SUB2:
+
+       KERNEL1x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L1x2_SUB2
+
+DTRMM_L1x2_SAVE:
+
+       SAVE1x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L1x2_END:
+
+DTRMM_L1x1_BEGIN:
+
+       andi.           T1,     M,      1
+       ble             DTRMM_L1x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     3                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             DTRMM_L1x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             DTRMM_L1x1_SUB4
+
+DTRMM_L1x1_LOOP_START:
+
+       LOAD1x1_1
+       KERNEL1x1_I1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -2
+       ble             DTRMM_L1x1_LOOP_END
+
+       .align 5
+
+DTRMM_L1x1_LOOP:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L1x1_LOOP
+
+DTRMM_L1x1_LOOP_END:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_E2
+
+       b               DTRMM_L1x1_SUB1
+
+DTRMM_L1x1_SUB4:
+
+       KERNEL1x1_SUBI1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       b               DTRMM_L1x1_SUB1
+
+DTRMM_L1x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             DTRMM_L1x1_SAVE
+       b               DTRMM_L1x1_SUB2
+
+DTRMM_L1x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             DTRMM_L1x1_SAVE
+
+DTRMM_L1x1_SUB2:
+
+       KERNEL1x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             DTRMM_L1x1_SUB2
+
+DTRMM_L1x1_SAVE:
+
+       SAVE1x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     3                       // TEMP1 * Number of values in A shifted -> TEMP1
+	add		BO,	BO,	T2					// BO += TEMP2 (skip the remaining K - KKK iterations of B)
+	add		AO,	AO,	T1					// AO += TEMP1 (skip the remaining K - KKK iterations of A)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+DTRMM_L1x1_END:
+
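+// End of the single-column section: only KK is bumped for the non-LEFT
+// case; B needs no further adjustment because no column block follows.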
+#if !defined(LEFT)
+       addi            KK,     KK,     1                                       // KK += Number of values in B
+#endif
+
+
+DTRMM_L1_END:
index d7cfe5e..c6e69b4 100644 (file)
 #define PREFETCHWSIZE  72
 #endif
 
-#ifdef POWER8
+#ifdef PPCG4
 #define PREFETCHSIZE   16
 #define PREFETCHWSIZE  72
 #endif
 
-#ifdef PPCG4
+#ifdef POWER8
 #define PREFETCHSIZE   16
 #define PREFETCHWSIZE  72
 #endif
@@ -198,7 +198,7 @@ LL(12):
        STFD    c12,  14 * SIZE(B)
        STFD    c16,  15 * SIZE(B)
 
-#ifdef POWER6
+#if defined(POWER6) || defined(POWER8)
        dcbtst  PREA, AO1
        dcbtst  PREA, AO2
        dcbtst  PREA, AO3
index 46b1cd9..3051344 100644 (file)
 #define PREFETCHWSIZE  48
 #endif
 
-#ifdef POWER8
+#ifdef PPCG4
 #define PREFETCHSIZE   16
 #define PREFETCHWSIZE  48
 #endif
 
-#ifdef PPCG4
+#ifdef POWER8
 #define PREFETCHSIZE   16
 #define PREFETCHWSIZE  48
 #endif
@@ -229,7 +229,7 @@ LL(12):
        STFD    c15,  14 * SIZE(B1)
        STFD    c16,  15 * SIZE(B1)
 
-#ifdef POWER6
+#if defined(POWER6) || defined(POWER8)
        dcbtst  PREA, AO1
        dcbtst  PREA, AO2
        dcbtst  PREA, AO3
index 5c46c43..77587ec 100644 (file)
 #define PREFETCHSIZE_C  40
 #endif
 
-#ifdef POWER8
-#define PREFETCHSIZE_A  96
-#define PREFETCHSIZE_C  40
-#endif
-
 #ifndef NEEDPARAM
 
 #ifndef __64BIT__
index 4577530..817a60b 100644 (file)
 #define PREFETCHSIZE_C   8
 #endif
 
-#ifdef POWER8
-#define PREFETCHSIZE_A  96
-#define PREFETCHSIZE_C   8
-#endif
-
 #define y01 f0
 #define y02 f1
 #define y03 f2
index 9f759c3..f7d768c 100644 (file)
 #define PREFETCHSIZE_A  40
 #endif
 
-#ifdef POWER8
-#define PREFETCHSIZE_A  40
-#endif
-
-#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
+#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
 #define NOP1
 #define NOP2
 #else
index e4e419b..d8e0823 100644 (file)
 #define PREFETCHSIZE_A  40
 #endif
 
-#ifdef POWER8
-#define PREFETCHSIZE_A  40
-#endif
-
-#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
+#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
 #define NOP1
 #define NOP2
 #else
diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S
new file mode 100644 (file)
index 0000000..03957f4
--- /dev/null
@@ -0,0 +1,332 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD   lwz
+#else
+#define LOAD   ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_R_SP 296(SP)
+#define ALPHA_I_SP 304(SP)
+#define FZERO  312(SP)
+#else
+#define STACKSIZE 256
+#define ALPHA_R_SP 224(SP)
+#define ALPHA_I_SP 232(SP)
+#define FZERO  240(SP)
+#endif
+
+#define        M       r3
+#define        N       r4
+#define        K       r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A      r6
+#define        B       r7
+#define        C       r8
+#define        LDC     r9
+#define OFFSET r10
+#else
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r6
+#define OFFSET r7
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A      r10
+#define        B       r6
+#define        C       r7
+#define        LDC     r8
+#define OFFSET r9
+#else
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r6
+#define OFFSET r7
+#endif
+#endif
+
+#define o0     0
+#define alpha_r vs30
+#define alpha_i vs31
+
+#define L      r15
+#define ALPHA  r16
+#define o24    r17
+#define T2     r19
+#define KK     r20
+#define        o8      r21
+#define        I       r22
+#define J      r23
+#define AO     r24
+#define        BO      r25
+#define        CO      r26
+#define o16    r27
+#define        o32     r28
+#define o48    r29
+
+#define PRE    r30
+#define T1     r31
+
+#ifndef NEEDPARAM
+
+       PROLOGUE
+       PROFCODE
+
+       addi    SP, SP, -STACKSIZE
+       li      r0, 0
+
+       stfd    f14,    0(SP)
+       stfd    f15,    8(SP)
+       stfd    f16,   16(SP)
+       stfd    f17,   24(SP)
+
+       stfd    f18,   32(SP)
+       stfd    f19,   40(SP)
+       stfd    f20,   48(SP)
+       stfd    f21,   56(SP)
+
+       stfd    f22,   64(SP)
+       stfd    f23,   72(SP)
+       stfd    f24,   80(SP)
+       stfd    f25,   88(SP)
+
+       stfd    f26,   96(SP)
+       stfd    f27,  104(SP)
+       stfd    f28,  112(SP)
+       stfd    f29,  120(SP)
+
+       stfd    f30,  128(SP)
+       stfd    f31,  136(SP)
+
+#ifdef __64BIT__
+       std     r31,  144(SP)
+       std     r30,  152(SP)
+       std     r29,  160(SP)
+       std     r28,  168(SP)
+       std     r27,  176(SP)
+       std     r26,  184(SP)
+       std     r25,  192(SP)
+       std     r24,  200(SP)
+       std     r23,  208(SP)
+       std     r22,  216(SP)
+       std     r21,  224(SP)
+       std     r20,  232(SP)
+       std     r19,  240(SP)
+       std     r18,  248(SP)
+       std     r17,  256(SP)
+       std     r16,  264(SP)
+       std     r15,  272(SP)
+#else
+       stw     r31,  144(SP)
+       stw     r30,  148(SP)
+       stw     r29,  152(SP)
+       stw     r28,  156(SP)
+       stw     r27,  160(SP)
+       stw     r26,  164(SP)
+       stw     r25,  168(SP)
+       stw     r24,  172(SP)
+       stw     r23,  176(SP)
+       stw     r22,  180(SP)
+       stw     r21,  184(SP)
+       stw     r20,  188(SP)
+       stw     r19,  192(SP)
+       stw     r18,  196(SP)
+       stw     r17,  200(SP)
+       stw     r16,  204(SP)
+       stw     r15,  208(SP)
+#endif
+
+       stfd    f1,  ALPHA_R_SP
+       stfd    f2,  ALPHA_I_SP
+       stw     r0,  FZERO
+
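+// Arguments that did not fit in registers are reloaded from the caller's
+// parameter save area below (FRAMESLOT(n) being the ABI-specific offset of
+// the n-th stack-passed argument): LDC always, OFFSET only for the TRMM
+// build, and B/C as well in the 32-bit AIX/Darwin double-complex case.
+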
+#ifdef linux
+#ifdef __64BIT__
+       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     B,   FRAMESLOT(0) + STACKSIZE(SP)
+       lwz     C,   FRAMESLOT(1) + STACKSIZE(SP)
+       lwz     LDC, FRAMESLOT(2) + STACKSIZE(SP)
+#else
+       lwz     LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     OFFSET,  FRAMESLOT(3) + STACKSIZE(SP)
+#else
+       lwz     OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+       neg     KK, OFFSET
+#endif
+#endif
+
+#include "zgemm_macros_8x2_power8.S"
+
+       cmpwi   cr0, M, 0
+       ble     L999
+       cmpwi   cr0, N, 0
+       ble     L999
+       cmpwi   cr0, K, 0
+       ble     L999
+
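+
+// Set up before entering the GEMM logic: LDC is scaled from elements to bytes
+// (by ZBASE_SHIFT, i.e. 16 bytes per double-complex element), PRE holds the
+// prefetch distance for the dcbt hints, o8..o48 are the fixed byte offsets
+// used by the indexed vector loads, and alpha_r/alpha_i are broadcast into
+// VSX registers for use in the SAVE macros.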
+       slwi    LDC, LDC, ZBASE_SHIFT
+       li      PRE, 256 
+       li      o8  , 8
+       li      o16 , 16
+       li      o24 , 24
+       li      o32 , 32
+       li      o48 , 48
+
+#ifdef __64BIT__
+       addi    ALPHA, SP, 296
+#else
+       addi    ALPHA, SP, 224
+#endif
+
+       lxvdsx  alpha_r, 0, ALPHA
+       lxvdsx  alpha_i, o8, ALPHA
+
+       .align 5
+
+#include "zgemm_logic_8x2_power8.S"
+
+L999:
+       addi    r3, 0, 0
+
+       lfd     f14,    0(SP)
+       lfd     f15,    8(SP)
+       lfd     f16,   16(SP)
+       lfd     f17,   24(SP)
+
+       lfd     f18,   32(SP)
+       lfd     f19,   40(SP)
+       lfd     f20,   48(SP)
+       lfd     f21,   56(SP)
+
+       lfd     f22,   64(SP)
+       lfd     f23,   72(SP)
+       lfd     f24,   80(SP)
+       lfd     f25,   88(SP)
+
+       lfd     f26,   96(SP)
+       lfd     f27,  104(SP)
+       lfd     f28,  112(SP)
+       lfd     f29,  120(SP)
+
+       lfd     f30,  128(SP)
+       lfd     f31,  136(SP)
+
+#ifdef __64BIT__
+       ld      r31,  144(SP)
+       ld      r30,  152(SP)
+       ld      r29,  160(SP)
+       ld      r28,  168(SP)
+       ld      r27,  176(SP)
+       ld      r26,  184(SP)
+       ld      r25,  192(SP)
+       ld      r24,  200(SP)
+       ld      r23,  208(SP)
+       ld      r22,  216(SP)
+       ld      r21,  224(SP)
+       ld      r20,  232(SP)
+       ld      r19,  240(SP)
+       ld      r18,  248(SP)
+       ld      r17,  256(SP)
+       ld      r16,  264(SP)
+       ld      r15,  272(SP)
+#else
+       lwz     r31,  144(SP)
+       lwz     r30,  148(SP)
+       lwz     r29,  152(SP)
+       lwz     r28,  156(SP)
+       lwz     r27,  160(SP)
+       lwz     r26,  164(SP)
+       lwz     r25,  168(SP)
+       lwz     r24,  172(SP)
+       lwz     r23,  176(SP)
+       lwz     r22,  180(SP)
+       lwz     r21,  184(SP)
+       lwz     r20,  188(SP)
+       lwz     r19,  192(SP)
+       lwz     r18,  196(SP)
+       lwz     r17,  200(SP)
+       lwz     r16,  204(SP)
+       lwz     r15,  208(SP)
+#endif
+
+       addi    SP, SP, STACKSIZE
+
+       blr
+
+       EPILOGUE
+#endif
diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S
new file mode 100644 (file)
index 0000000..e829fd6
--- /dev/null
@@ -0,0 +1,901 @@
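+// Layout of this file: the J loop walks N in blocks of 2 columns, the I loop
+// walks M in blocks of 8 (then 4, 2, 1) rows, and the K loop is unrolled by 8.
+// For each tile, LOAD*_1 preloads the first A/B values, KERNEL*_I1 starts the
+// accumulators with xvmuldp, the KERNEL*_1/KERNEL*_2 pair then ping-pongs
+// between two register sets so loads overlap the FMAs, and KERNEL*_E2 drains
+// the pipeline.  K values not covered by the unrolled loop (K & 7) go through
+// the KERNEL*_SUB1 loop.  dcbt AO, PRE issues software prefetches for the A
+// stream in the 8-wide tiles only (PRE is set to 256 in the kernel prologue).
+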
+       srawi.          J,      N,      1
+       ble             ZGEMM_L2_END
+
+ZGEMM_L2_BEGIN:
+
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       1
+       add             C,      C,      T1
+       srawi.          I,      M,      3
+       ble             ZGEMM_L2x8_END
+
+ZGEMM_L2x8_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             ZGEMM_L2x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZGEMM_L2x8_SUB4
+
+ZGEMM_L2x8_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_I1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       addic.          L,      L,      -2
+       ble             ZGEMM_L2x8_LOOP_END
+
+       .align 5
+
+ZGEMM_L2x8_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L2x8_LOOP
+
+ZGEMM_L2x8_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       KERNEL2x8_E2
+
+       b               ZGEMM_L2x8_SUB1
+
+ZGEMM_L2x8_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL2x8_SUBI1
+       dcbt            AO,     PRE
+       KERNEL2x8_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x8_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x8_SUB1
+
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       b               ZGEMM_L2x8_SUB1
+
+ZGEMM_L2x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZGEMM_L2x8_SAVE
+       b               ZGEMM_L2x8_SUB2
+
+ZGEMM_L2x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             ZGEMM_L2x8_SAVE
+
+ZGEMM_L2x8_SUB2:
+
+       KERNEL2x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L2x8_SUB2
+
+ZGEMM_L2x8_SAVE:
+
+       SAVE2x8
+
+       addic.          I,      I,      -1
+       bgt             ZGEMM_L2x8_BEGIN
+
+ZGEMM_L2x8_END:
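+
+// Rows of C not covered by the 8-wide tiles (M & 7) are handled below with
+// 4-, 2- and 1-row tiles.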
+
+ZGEMM_L2x4_BEGIN:
+
+       andi.           T2,     M,      7
+       ble             ZGEMM_L2x1_END
+
+       andi.           T1,     M,      4
+       ble             ZGEMM_L2x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             ZGEMM_L2x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZGEMM_L2x4_SUB4
+
+ZGEMM_L2x4_LOOP_START:
+
+       LOAD2x4_1
+       KERNEL2x4_I1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -2
+       ble             ZGEMM_L2x4_LOOP_END
+
+       .align 5
+
+ZGEMM_L2x4_LOOP:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L2x4_LOOP
+
+ZGEMM_L2x4_LOOP_END:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_E2
+
+       b               ZGEMM_L2x4_SUB1
+
+ZGEMM_L2x4_SUB4:
+
+       KERNEL2x4_SUBI1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       b               ZGEMM_L2x4_SUB1
+
+ZGEMM_L2x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZGEMM_L2x4_SAVE
+       b               ZGEMM_L2x4_SUB2
+
+ZGEMM_L2x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             ZGEMM_L2x4_SAVE
+
+ZGEMM_L2x4_SUB2:
+
+       KERNEL2x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L2x4_SUB2
+
+ZGEMM_L2x4_SAVE:
+
+       SAVE2x4
+
+ZGEMM_L2x4_END:
+
+ZGEMM_L2x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             ZGEMM_L2x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             ZGEMM_L2x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZGEMM_L2x2_SUB4
+
+ZGEMM_L2x2_LOOP_START:
+
+       LOAD2x2_1
+       KERNEL2x2_I1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -2
+       ble             ZGEMM_L2x2_LOOP_END
+
+       .align 5
+
+ZGEMM_L2x2_LOOP:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L2x2_LOOP
+
+ZGEMM_L2x2_LOOP_END:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_E2
+
+       b               ZGEMM_L2x2_SUB1
+
+ZGEMM_L2x2_SUB4:
+
+       KERNEL2x2_SUBI1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       b               ZGEMM_L2x2_SUB1
+
+ZGEMM_L2x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZGEMM_L2x2_SAVE
+       b               ZGEMM_L2x2_SUB2
+
+ZGEMM_L2x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             ZGEMM_L2x2_SAVE
+
+ZGEMM_L2x2_SUB2:
+
+       KERNEL2x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L2x2_SUB2
+
+ZGEMM_L2x2_SAVE:
+
+       SAVE2x2
+
+ZGEMM_L2x2_END:
+
+ZGEMM_L2x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             ZGEMM_L2x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             ZGEMM_L2x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZGEMM_L2x1_SUB4
+
+ZGEMM_L2x1_LOOP_START:
+
+       LOAD2x1_1
+       KERNEL2x1_I1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -2
+       ble             ZGEMM_L2x1_LOOP_END
+
+       .align 5
+
+ZGEMM_L2x1_LOOP:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L2x1_LOOP
+
+ZGEMM_L2x1_LOOP_END:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_E2
+
+       b               ZGEMM_L2x1_SUB1
+
+ZGEMM_L2x1_SUB4:
+
+       KERNEL2x1_SUBI1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       b               ZGEMM_L2x1_SUB1
+
+ZGEMM_L2x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZGEMM_L2x1_SAVE
+       b               ZGEMM_L2x1_SUB2
+
+ZGEMM_L2x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             ZGEMM_L2x1_SAVE
+
+ZGEMM_L2x1_SUB2:
+
+       KERNEL2x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L2x1_SUB2
+
+ZGEMM_L2x1_SAVE:
+
+       SAVE2x1
+
+ZGEMM_L2x1_END:
+
+       slwi            T1,     K,      5                               // T1 = K * 32 bytes (2 double-complex values of B per k)
+       add             B,      B,      T1                              // advance B to the next 2-column panel
+
+       addic.          J,      J,      -1
+       bgt             ZGEMM_L2_BEGIN
+
+       andi.           T2,     N,      1
+       ble             L999
+
+ZGEMM_L2_END:
+
+       b               ZGEMM_L1_BEGIN
+
+L999_H1:
+
+       b               L999
+
+ZGEMM_L1_BEGIN:
+
+       andi.           T1,     N,      1
+       ble             ZGEMM_L1_END
+       mr              CO,     C
+       mr              AO,     A
+       srawi.          I,      M,      3
+       ble             ZGEMM_L1x8_END
+
+ZGEMM_L1x8_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             ZGEMM_L1x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZGEMM_L1x8_SUB4
+
+ZGEMM_L1x8_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_I1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       addic.          L,      L,      -2
+       ble             ZGEMM_L1x8_LOOP_END
+
+       .align 5
+
+ZGEMM_L1x8_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L1x8_LOOP
+
+ZGEMM_L1x8_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       KERNEL1x8_E2
+
+       b               ZGEMM_L1x8_SUB1
+
+ZGEMM_L1x8_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL1x8_SUBI1
+       dcbt            AO,     PRE
+       KERNEL1x8_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x8_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x8_SUB1
+
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       b               ZGEMM_L1x8_SUB1
+
+ZGEMM_L1x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZGEMM_L1x8_SAVE
+       b               ZGEMM_L1x8_SUB2
+
+ZGEMM_L1x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             ZGEMM_L1x8_SAVE
+
+ZGEMM_L1x8_SUB2:
+
+       KERNEL1x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L1x8_SUB2
+
+ZGEMM_L1x8_SAVE:
+
+       SAVE1x8
+
+       addic.          I,      I,      -1
+       bgt             ZGEMM_L1x8_BEGIN
+
+ZGEMM_L1x8_END:
+
+ZGEMM_L1x4_BEGIN:
+
+       andi.           T2,     M,      7
+       ble             ZGEMM_L1x1_END
+
+       andi.           T1,     M,      4
+       ble             ZGEMM_L1x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             ZGEMM_L1x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZGEMM_L1x4_SUB4
+
+ZGEMM_L1x4_LOOP_START:
+
+       LOAD1x4_1
+       KERNEL1x4_I1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -2
+       ble             ZGEMM_L1x4_LOOP_END
+
+       .align 5
+
+ZGEMM_L1x4_LOOP:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L1x4_LOOP
+
+ZGEMM_L1x4_LOOP_END:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_E2
+
+       b               ZGEMM_L1x4_SUB1
+
+ZGEMM_L1x4_SUB4:
+
+       KERNEL1x4_SUBI1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       b               ZGEMM_L1x4_SUB1
+
+ZGEMM_L1x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZGEMM_L1x4_SAVE
+       b               ZGEMM_L1x4_SUB2
+
+ZGEMM_L1x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             ZGEMM_L1x4_SAVE
+
+ZGEMM_L1x4_SUB2:
+
+       KERNEL1x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L1x4_SUB2
+
+ZGEMM_L1x4_SAVE:
+
+       SAVE1x4
+
+ZGEMM_L1x4_END:
+
+ZGEMM_L1x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             ZGEMM_L1x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             ZGEMM_L1x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZGEMM_L1x2_SUB4
+
+ZGEMM_L1x2_LOOP_START:
+
+       LOAD1x2_1
+       KERNEL1x2_I1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -2
+       ble             ZGEMM_L1x2_LOOP_END
+
+       .align 5
+
+ZGEMM_L1x2_LOOP:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L1x2_LOOP
+
+ZGEMM_L1x2_LOOP_END:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_E2
+
+       b               ZGEMM_L1x2_SUB1
+
+ZGEMM_L1x2_SUB4:
+
+       KERNEL1x2_SUBI1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       b               ZGEMM_L1x2_SUB1
+
+ZGEMM_L1x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZGEMM_L1x2_SAVE
+       b               ZGEMM_L1x2_SUB2
+
+ZGEMM_L1x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             ZGEMM_L1x2_SAVE
+
+ZGEMM_L1x2_SUB2:
+
+       KERNEL1x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L1x2_SUB2
+
+ZGEMM_L1x2_SAVE:
+
+       SAVE1x2
+
+ZGEMM_L1x2_END:
+
+ZGEMM_L1x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             ZGEMM_L1x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             ZGEMM_L1x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZGEMM_L1x1_SUB4
+
+ZGEMM_L1x1_LOOP_START:
+
+       LOAD1x1_1
+       KERNEL1x1_I1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -2
+       ble             ZGEMM_L1x1_LOOP_END
+
+       .align 5
+
+ZGEMM_L1x1_LOOP:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L1x1_LOOP
+
+ZGEMM_L1x1_LOOP_END:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_E2
+
+       b               ZGEMM_L1x1_SUB1
+
+ZGEMM_L1x1_SUB4:
+
+       KERNEL1x1_SUBI1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       b               ZGEMM_L1x1_SUB1
+
+ZGEMM_L1x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZGEMM_L1x1_SAVE
+       b               ZGEMM_L1x1_SUB2
+
+ZGEMM_L1x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             ZGEMM_L1x1_SAVE
+
+ZGEMM_L1x1_SUB2:
+
+       KERNEL1x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZGEMM_L1x1_SUB2
+
+ZGEMM_L1x1_SAVE:
+
+       SAVE1x1
+
+ZGEMM_L1x1_END:
+
+ZGEMM_L1_END:
diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S
new file mode 100644 (file)
index 0000000..3e5ea9c
--- /dev/null
@@ -0,0 +1,3074 @@
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+       #define XSFADD_R1       xsadddp
+       #define XSFADD_R2       xssubdp
+       #define XSFADD_I1       xsadddp
+       #define XSFADD_I2       xsadddp
+
+#elif  defined(CN) || defined(CT) || defined(RN) || defined(RT)
+
+       #define XSFADD_R1       xsadddp
+       #define XSFADD_R2       xsadddp
+       #define XSFADD_I1       xssubdp
+       #define XSFADD_I2       xsadddp
+
+#elif  defined(NC) || defined(TC) || defined(NR) || defined(TR)
+
+       #define XSFADD_R1       xsadddp
+       #define XSFADD_R2       xsadddp
+       #define XSFADD_I1       xsadddp
+       #define XSFADD_I2       xssubdp
+
+#else          // CC || CR || RC || RR
+
+       #define XSFADD_R1       xsadddp
+       #define XSFADD_R2       xssubdp
+       #define XSFADD_I1       xssubdp
+       #define XSFADD_I2       xssubdp
+
+#endif
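+
+// Sign selection for the complex product.  One element of C accumulates
+//   (a_r + i*a_i) * (b_r + i*b_i) = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r)
+// The SAVE macros form the real part with XSFADD_R1/XSFADD_R2 and the
+// imaginary part with XSFADD_I1/XSFADD_I2; the branches above choose xsadddp
+// or xssubdp per term so that the conjugated variants of the kernel get the
+// corresponding sign flips.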
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
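+
+// Register usage for the 2x8 tile: vs0-vs7 and vs8-vs15 hold the two in-flight
+// sets of A values, vs16-vs19 and vs20-vs23 the two sets of broadcast B values
+// (real and imaginary parts loaded separately via lxvdsx), and vs32-vs63 are
+// the 32 accumulators.  Each accumulator pair keeps (real*real, imag*real)
+// next to (real*imag, imag*imag); the SAVE macros combine them into the final
+// complex values.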
+
+.macro LOAD2x8_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+
+.endm
+
+.macro KERNEL2x8_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs12,   o0,     AO              // load real,imag from A
+       lxvd2x          vs13,   o16,    AO              // load real,imag from A
+       lxvd2x          vs14,   o32,    AO              // load real,imag from A
+       lxvd2x          vs15,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
+       xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
+       xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
+       xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
+       xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs48,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs49,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs50,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs51,   vs1,    vs19            // real*imag, imag*imag
+       xvmuldp         vs52,   vs2,    vs18            // real*real, imag*real
+       xvmuldp         vs53,   vs2,    vs19            // real*imag, imag*imag
+       xvmuldp         vs54,   vs3,    vs18            // real*real, imag*real
+       xvmuldp         vs55,   vs3,    vs19            // real*imag, imag*imag
+       xvmuldp         vs56,   vs4,    vs18            // real*real, imag*real
+       xvmuldp         vs57,   vs4,    vs19            // real*imag, imag*imag
+       xvmuldp         vs58,   vs5,    vs18            // real*real, imag*real
+       xvmuldp         vs59,   vs5,    vs19            // real*imag, imag*imag
+       xvmuldp         vs60,   vs6,    vs18            // real*real, imag*real
+       xvmuldp         vs61,   vs6,    vs19            // real*imag, imag*imag
+       xvmuldp         vs62,   vs7,    vs18            // real*real, imag*real
+       xvmuldp         vs63,   vs7,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+
+       xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
+       xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
+       xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
+
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
+       xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
+
+       addi            AO,     AO,     64
+
+       xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
+       xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs48,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs49,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs50,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs51,   vs1,    vs19            // real*imag, imag*imag
+
+       lxvd2x          vs12,   o0,     AO              // load real,imag from A
+       lxvd2x          vs13,   o16,    AO              // load real,imag from A
+
+       xvmaddadp       vs52,   vs2,    vs18            // real*real, imag*real
+       xvmaddadp       vs53,   vs2,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs54,   vs3,    vs18            // real*real, imag*real
+       xvmaddadp       vs55,   vs3,    vs19            // real*imag, imag*imag
+
+       lxvd2x          vs14,   o32,    AO              // load real,imag from A
+       lxvd2x          vs15,   o48,    AO              // load real,imag from A
+
+       xvmaddadp       vs56,   vs4,    vs18            // real*real, imag*real
+       xvmaddadp       vs57,   vs4,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs58,   vs5,    vs18            // real*real, imag*real
+       xvmaddadp       vs59,   vs5,    vs19            // real*imag, imag*imag
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       xvmaddadp       vs60,   vs6,    vs18            // real*real, imag*real
+       xvmaddadp       vs61,   vs6,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs62,   vs7,    vs18            // real*real, imag*real
+       xvmaddadp       vs63,   vs7,    vs19            // real*imag, imag*imag
+
+       addi            AO,     AO,     64
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
+       xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
+       xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
+
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
+       xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
+       xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
+
+       addi            AO,     AO,     64
+
+       xvmaddadp       vs48,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs49,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs50,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs51,   vs9,    vs23            // real*imag, imag*imag
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+
+       xvmaddadp       vs52,   vs10,   vs22            // real*real, imag*real
+       xvmaddadp       vs53,   vs10,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs54,   vs11,   vs22            // real*real, imag*real
+       xvmaddadp       vs55,   vs11,   vs23            // real*imag, imag*imag
+
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       xvmaddadp       vs56,   vs12,   vs22            // real*real, imag*real
+       xvmaddadp       vs57,   vs12,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs58,   vs13,   vs22            // real*real, imag*real
+       xvmaddadp       vs59,   vs13,   vs23            // real*imag, imag*imag
+
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       xvmaddadp       vs60,   vs14,   vs22            // real*real, imag*real
+       xvmaddadp       vs61,   vs14,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs62,   vs15,   vs22            // real*real, imag*real
+       xvmaddadp       vs63,   vs15,   vs23            // real*imag, imag*imag
+
+       addi            AO,     AO,     64
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
+       xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
+       xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
+       xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
+       xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs48,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs49,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs50,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs51,   vs9,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs52,   vs10,   vs22            // real*real, imag*real
+       xvmaddadp       vs53,   vs10,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs54,   vs11,   vs22            // real*real, imag*real
+       xvmaddadp       vs55,   vs11,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs56,   vs12,   vs22            // real*real, imag*real
+       xvmaddadp       vs57,   vs12,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs58,   vs13,   vs22            // real*real, imag*real
+       xvmaddadp       vs59,   vs13,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs60,   vs14,   vs22            // real*real, imag*real
+       xvmaddadp       vs61,   vs14,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs62,   vs15,   vs22            // real*real, imag*real
+       xvmaddadp       vs63,   vs15,   vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
+       xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
+       xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
+       xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
+       xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs48,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs49,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs50,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs51,   vs1,    vs19            // real*imag, imag*imag
+       xvmuldp         vs52,   vs2,    vs18            // real*real, imag*real
+       xvmuldp         vs53,   vs2,    vs19            // real*imag, imag*imag
+       xvmuldp         vs54,   vs3,    vs18            // real*real, imag*real
+       xvmuldp         vs55,   vs3,    vs19            // real*imag, imag*imag
+       xvmuldp         vs56,   vs4,    vs18            // real*real, imag*real
+       xvmuldp         vs57,   vs4,    vs19            // real*imag, imag*imag
+       xvmuldp         vs58,   vs5,    vs18            // real*real, imag*real
+       xvmuldp         vs59,   vs5,    vs19            // real*imag, imag*imag
+       xvmuldp         vs60,   vs6,    vs18            // real*real, imag*real
+       xvmuldp         vs61,   vs6,    vs19            // real*imag, imag*imag
+       xvmuldp         vs62,   vs7,    vs18            // real*real, imag*real
+       xvmuldp         vs63,   vs7,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
+       xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
+       xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
+       xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
+       xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs48,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs49,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs50,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs51,   vs1,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs52,   vs2,    vs18            // real*real, imag*real
+       xvmaddadp       vs53,   vs2,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs54,   vs3,    vs18            // real*real, imag*real
+       xvmaddadp       vs55,   vs3,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs56,   vs4,    vs18            // real*real, imag*real
+       xvmaddadp       vs57,   vs4,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs58,   vs5,    vs18            // real*real, imag*real
+       xvmaddadp       vs59,   vs5,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs60,   vs6,    vs18            // real*real, imag*real
+       xvmaddadp       vs61,   vs6,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs62,   vs7,    vs18            // real*real, imag*real
+       xvmaddadp       vs63,   vs7,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x8
+
+
+       mr              T1,     CO
+       addi            T2,     T1,     64
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+       lxvd2x          vs20,   o0,     T2
+       lxvd2x          vs21,   o16,    T2
+       lxvd2x          vs22,   o32,    T2
+       lxvd2x          vs23,   o48,    T2
+
+#endif
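+
+// For each of the eight accumulator pairs of this column, the two partial
+// products are reduced to a single (real, imag) result, scaled by
+// alpha_r/alpha_i, and merged back into vector form with xxpermdi.  The merged
+// values in vs8-vs15 are then added to the C values loaded above in the
+// non-TRMM case before being stored back; the same sequence is repeated for
+// the second column further down.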
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs33            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs35            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
+
+       xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs36            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs37            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
+
+       xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs38            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs39            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs41,   vs41                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs40            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs41            // imagA*imagB
+
+       xxswapd         vs40,   vs40                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs41,   vs41                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs40            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs41            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs12,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs43,   vs43                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs42            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs43            // imagA*imagB
+
+       xxswapd         vs42,   vs42                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs43,   vs43                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs42            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs43            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs13,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs45,   vs45                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs44            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs45            // imagA*imagB
+
+       xxswapd         vs44,   vs44                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs45,   vs45                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs44            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs45            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs14,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs47,   vs47                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs46            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs47            // imagA*imagB
+
+       xxswapd         vs46,   vs46                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs47,   vs47                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs46            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs47            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs15,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+       xvadddp         vs12,   vs12,   vs20
+       xvadddp         vs13,   vs13,   vs21
+       xvadddp         vs14,   vs14,   vs22
+       xvadddp         vs15,   vs15,   vs23
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+       stxvd2x         vs12,   o0,     T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+       lxvd2x          vs20,   o0,     T2
+       lxvd2x          vs21,   o16,    T2
+       lxvd2x          vs22,   o32,    T2
+       lxvd2x          vs23,   o48,    T2
+
+#endif
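+// Reduce the second set of accumulators (vs48-vs63) into eight alpha-scaled
+// complex results (vs8-vs15) for the next line of C (T1/T2 already advanced by LDC).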
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs49,   vs49                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs48            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs49            // imagA*imagB
+
+       xxswapd         vs48,   vs48                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs49,   vs49                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs48            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs49            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs51,   vs51                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs50            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs51            // imagA*imagB
+
+       xxswapd         vs50,   vs50                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs51,   vs51                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs50            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs51            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs53,   vs53                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs52            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs53            // imagA*imagB
+
+       xxswapd         vs52,   vs52                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs53,   vs53                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs52            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs53            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs55,   vs55                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs54            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs55            // imagA*imagB
+
+       xxswapd         vs54,   vs54                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs55,   vs55                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs54            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs55            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs57,   vs57                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs56            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs57            // imagA*imagB
+
+       xxswapd         vs56,   vs56                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs57,   vs57                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs56            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs57            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs12,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs59,   vs59                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs58            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs59            // imagA*imagB
+
+       xxswapd         vs58,   vs58                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs59,   vs59                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs58            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs59            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs13,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs61,   vs61                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs60            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs61            // imagA*imagB
+
+       xxswapd         vs60,   vs60                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs61,   vs61                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs60            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs61            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs14,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs63,   vs63                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs62            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs63            // imagA*imagB
+
+       xxswapd         vs62,   vs62                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs63,   vs63                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs62            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs63            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs15,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+       xvadddp         vs12,   vs12,   vs20
+       xvadddp         vs13,   vs13,   vs21
+       xvadddp         vs14,   vs14,   vs22
+       xvadddp         vs15,   vs15,   vs23
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+       stxvd2x         vs12,   o0,     T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+       addi            CO,     CO,     128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
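+// Register usage for the 2x4 macros:
+//   vs0-vs3   / vs8-vs11  : four complex elements of A (two alternating load sets)
+//   vs16-vs19 / vs20-vs23 : real/imag parts of two complex elements of B, splatted
+//   vs32-vs39             : accumulators for the first B element (even = A*realB, odd = A*imagB)
+//   vs40-vs47             : accumulators for the second B element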
+
+.macro LOAD2x4_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+
+.endm
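+// The KERNEL2x4_* macros follow the same software-pipelined k-loop pattern as the
+// wider cases above:
+//   _I1            first iteration: xvmuldp initializes the accumulators while the
+//                  next A/B values are loaded into vs8-vs11 / vs20-vs23
+//   _1 / _2        unrolled pair: each multiplies one register set while reloading the other
+//   _E2            drains the last prefetched set without further loads
+//   _SUBI1 / _SUB1 single-step variants for the k remainder (initialize / accumulate)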
+
+.macro KERNEL2x4_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs40,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs41,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs42,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs43,   vs1,    vs19            // real*imag, imag*imag
+       xvmuldp         vs44,   vs2,    vs18            // real*real, imag*real
+       xvmuldp         vs45,   vs2,    vs19            // real*imag, imag*imag
+       xvmuldp         vs46,   vs3,    vs18            // real*real, imag*real
+       xvmuldp         vs47,   vs3,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs40,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs41,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs43,   vs1,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs2,    vs18            // real*real, imag*real
+       xvmaddadp       vs45,   vs2,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs3,    vs18            // real*real, imag*real
+       xvmaddadp       vs47,   vs3,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs40,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs41,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs43,   vs9,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs10,   vs22            // real*real, imag*real
+       xvmaddadp       vs45,   vs10,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs11,   vs22            // real*real, imag*real
+       xvmaddadp       vs47,   vs11,   vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs40,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs41,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs43,   vs9,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs10,   vs22            // real*real, imag*real
+       xvmaddadp       vs45,   vs10,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs11,   vs22            // real*real, imag*real
+       xvmaddadp       vs47,   vs11,   vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs40,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs41,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs42,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs43,   vs1,    vs19            // real*imag, imag*imag
+       xvmuldp         vs44,   vs2,    vs18            // real*real, imag*real
+       xvmuldp         vs45,   vs2,    vs19            // real*imag, imag*imag
+       xvmuldp         vs46,   vs3,    vs18            // real*real, imag*real
+       xvmuldp         vs47,   vs3,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs40,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs41,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs43,   vs1,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs2,    vs18            // real*real, imag*real
+       xvmaddadp       vs45,   vs2,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs3,    vs18            // real*real, imag*real
+       xvmaddadp       vs47,   vs3,    vs19            // real*imag, imag*imag
+
+
+.endm
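+// SAVE2x4 reduces each accumulator pair to one complex value and applies alpha.
+// For the non-conjugated case the per-element arithmetic is:
+//   real = realA*realB - imagA*imagB
+//   imag = imagA*realB + realA*imagB
+//   out  = (real*alpha_r - imag*alpha_i) + i*(real*alpha_i + imag*alpha_r)
+// The XSFADD_R1/R2/I1/I2 macros are assumed to be defined by the including kernel
+// file according to the conjugation variant.  The result is added to C unless
+// TRMMKERNEL is defined, in which case it overwrites C.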
+
+.macro SAVE2x4
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+
+#endif
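+// Reduce accumulators vs32-vs39 into four alpha-scaled complex results (vs8-vs11)
+// for the first line of C.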
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
+
+       xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs36            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs37            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
+
+       xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs38            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs39            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+
+#endif
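+// Reduce accumulators vs40-vs47 into vs8-vs11 for the next line of C.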
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs41,   vs41                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs40            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs41            // imagA*imagB
+
+       xxswapd         vs40,   vs40                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs41,   vs41                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs40            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs41            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs43,   vs43                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs42            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs43            // imagA*imagB
+
+       xxswapd         vs42,   vs42                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs43,   vs43                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs42            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs43            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs45,   vs45                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs44            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs45            // imagA*imagB
+
+       xxswapd         vs44,   vs44                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs45,   vs45                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs44            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs45            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs47,   vs47                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs46            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs47            // imagA*imagB
+
+       xxswapd         vs46,   vs46                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs47,   vs47                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs46            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs47            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
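+// Same scheme as the 2x4 macros, restricted to two complex elements of A
+// (vs0/vs1 and vs8/vs9): vs32-vs35 accumulate against the first B element,
+// vs36-vs39 against the second.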
+
+.macro LOAD2x2_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+
+.endm
+
+.macro KERNEL2x2_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs36,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs37,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs38,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs39,   vs1,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs36,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs37,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs39,   vs1,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs36,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs37,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs39,   vs9,    vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs36,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs37,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs39,   vs9,    vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs36,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs37,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs38,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs39,   vs1,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs36,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs37,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs39,   vs1,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x2
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+
+#endif
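+// Reduce accumulators vs32-vs35 into two alpha-scaled complex results (vs8/vs9)
+// for the first line of C.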
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+
+#endif
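+// Reduce accumulators vs36-vs39 into vs8/vs9 for the next line of C.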
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
+
+       xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs36            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs37            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
+
+       xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs38            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs39            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
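+// Same scheme with a single complex element of A (vs0 / vs8): vs32/vs33
+// accumulate against the first B element, vs34/vs35 against the second.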
+
+.macro LOAD2x1_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+
+.endm
+
+.macro KERNEL2x1_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs34,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs35,   vs0,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs34,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs35,   vs0,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs34,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs35,   vs8,    vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs34,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs35,   vs8,    vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs34,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs35,   vs0,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs34,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs35,   vs0,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x1
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+
+#endif
+
+
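+// Reduce one accumulator pair to a complex scalar and apply alpha:
+// vs32 = {realA*realB, imagA*realB}, vs33 = {realA*imagB, imagA*imagB}.
+// The XSFADD_R*/XSFADD_I* macros (defined elsewhere, presumably per
+// conjugation variant) combine the parts into vs0 = real and vs1 = imag,
+// and the scalar multiplies below form
+//   c_r = real*alpha_r - imag*alpha_i
+//   c_i = real*alpha_i + imag*alpha_r
+// before xxpermdi merges the two halves back into one vector.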
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs33            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs35            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
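+
+// Same pipelining scheme as above, but with a single column of B and eight
+// complex elements of A (128 bytes) per K iteration, accumulated in the eight
+// register pairs vs32/vs33 ... vs46/vs47.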
+
+.macro LOAD1x8_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+
+.endm
+
+.macro KERNEL1x8_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs12,   o0,     AO              // load real,imag from A
+       lxvd2x          vs13,   o16,    AO              // load real,imag from A
+       lxvd2x          vs14,   o32,    AO              // load real,imag from A
+       lxvd2x          vs15,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
+       xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
+       xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
+       xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
+       xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs12,   o0,     AO              // load real,imag from A
+       lxvd2x          vs13,   o16,    AO              // load real,imag from A
+       lxvd2x          vs14,   o32,    AO              // load real,imag from A
+       lxvd2x          vs15,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
+       xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
+       xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
+       xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
+       xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
+       xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
+       xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
+       xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
+       xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
+       xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
+       xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
+       xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
+       xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
+       xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
+       xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
+       xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
+       xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
+       xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
+       xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
+       xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
+       xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x8
+
+
+       mr              T1,     CO
+       addi            T2,     T1,     64
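+// T2 = CO+64: the preset offsets o0..o48 (presumably GPRs holding 0,16,32,48)
+// reach only 64 bytes, so the upper four vectors of the 128-byte row are
+// addressed relative to T2.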
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+       lxvd2x          vs20,   o0,     T2
+       lxvd2x          vs21,   o16,    T2
+       lxvd2x          vs22,   o32,    T2
+       lxvd2x          vs23,   o48,    T2
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs33            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs35            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
+
+       xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs36            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs37            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
+
+       xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs38            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs39            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs41,   vs41                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs40            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs41            // imagA*imagB
+
+       xxswapd         vs40,   vs40                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs41,   vs41                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs40            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs41            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs12,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs43,   vs43                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs42            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs43            // imagA*imagB
+
+       xxswapd         vs42,   vs42                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs43,   vs43                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs42            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs43            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs13,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs45,   vs45                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs44            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs45            // imagA*imagB
+
+       xxswapd         vs44,   vs44                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs45,   vs45                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs44            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs45            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs14,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs47,   vs47                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs46            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs47            // imagA*imagB
+
+       xxswapd         vs46,   vs46                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs47,   vs47                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs46            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs47            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs15,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+       xvadddp         vs12,   vs12,   vs20
+       xvadddp         vs13,   vs13,   vs21
+       xvadddp         vs14,   vs14,   vs22
+       xvadddp         vs15,   vs15,   vs23
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+       stxvd2x         vs12,   o0,     T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+       addi            CO,     CO,     128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
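+
+// As above, with four complex elements of A (64 bytes) per K iteration,
+// accumulated in vs32/vs33 ... vs38/vs39.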
+
+.macro LOAD1x4_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+
+.endm
+
+.macro KERNEL1x4_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x4
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs33            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs35            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
+
+       xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs36            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs37            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
+
+       xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs38            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs39            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
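+
+// Two complex elements of A (32 bytes) per K iteration, accumulated in
+// vs32/vs33 and vs34/vs35.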
+
+.macro LOAD1x2_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+
+.endm
+
+.macro KERNEL1x2_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x2
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs33            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs35            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
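+
+// Scalar case: one complex element of A per K iteration, accumulated in the
+// single pair vs32/vs33.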
+
+.macro LOAD1x1_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+
+.endm
+
+.macro KERNEL1x1_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x1
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs33            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     16
+
+.endm
+
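The macros above close out the zgemm/ztrmm macro set with the N=1, M=1 case. As a reading aid only (not part of the patch; the helper name and the non-conjugated sign convention are assumptions), the arithmetic they implement corresponds roughly to this C model: vs32 accumulates a[k]*Re(b[k]), vs33 accumulates a[k]*Im(b[k]), and SAVE1x1 recombines the four partial sums and applies alpha.

    /* Illustrative C model of the 1x1 double-complex micro-kernel above
     * (assumed sketch, non-conjugated case; not part of the patch). */
    #include <complex.h>

    static void zkernel_1x1_model(long K, const double complex *a,
                                  const double complex *b,
                                  double complex *c, double complex alpha)
    {
        double acc_rr = 0.0, acc_ir = 0.0;   /* vs32: real*real, imag*real */
        double acc_ri = 0.0, acc_ii = 0.0;   /* vs33: real*imag, imag*imag */

        for (long k = 0; k < K; k++) {
            acc_rr += creal(a[k]) * creal(b[k]);
            acc_ir += cimag(a[k]) * creal(b[k]);
            acc_ri += creal(a[k]) * cimag(b[k]);
            acc_ii += cimag(a[k]) * cimag(b[k]);
        }

        /* SAVE1x1: combine the partial sums (sign choices differ for the
         * CONJ/XCONJ variants) and scale by alpha. */
        double res_r = acc_rr - acc_ii;
        double res_i = acc_ri + acc_ir;
        double out_r = res_r * creal(alpha) - res_i * cimag(alpha);
        double out_i = res_r * cimag(alpha) + res_i * creal(alpha);

        *c += out_r + out_i * I;   /* the TRMM build stores instead of adding */
    }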
index f934399..23e0177 100644 (file)
 #define PREFETCHSIZE_C  24
 #endif
 
-#ifdef POWER8
-#define PREFETCHSIZE_A  24
-#define PREFETCHSIZE_C  24
-#endif
-
 #ifndef XCONJ
 #define FMADDR FMADD
 #define FMSUBR FNMSUB
index 2b45014..c0bad31 100644 (file)
 #define PREFETCHSIZE_C   8
 #endif
 
-#ifdef POWER8
-#define PREFETCHSIZE_A  24
-#define PREFETCHSIZE_C   8
-#endif
-
 #if !(defined(CONJ) && defined(XCONJ))
 #define FMADDR FMADD
 #define FMSUBR FNMSUB
index 394c030..b348e32 100644 (file)
 #define PREFETCHSIZE_A  112
 #endif
 
-#ifdef POWER8
-#define PREFETCHSIZE_A  112
-#endif
-
-#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
+#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
 #define NOP1
 #define NOP2
 #else
index a061cd7..b631cbe 100644 (file)
 #define PREFETCHSIZE_A  112
 #endif
 
-#ifdef POWER8
-#define PREFETCHSIZE_A  112
-#endif
-
-#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
+#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
 #define NOP1
 #define NOP2
 #else
diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S
new file mode 100644 (file)
index 0000000..dbbc8f9
--- /dev/null
@@ -0,0 +1,342 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD   lwz
+#else
+#define LOAD   ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_R_SP 296(SP)
+#define ALPHA_I_SP 304(SP)
+#define FZERO  312(SP)
+#else
+#define STACKSIZE 256
+#define ALPHA_R_SP 224(SP)
+#define ALPHA_I_SP 232(SP)
+#define FZERO  240(SP)
+#endif
+
+#define        M       r3
+#define        N       r4
+#define        K       r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A      r6
+#define        B       r7
+#define        C       r8
+#define        LDC     r9
+#define OFFSET r10
+#else
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r6
+#define OFFSET r7
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A      r10
+#define        B       r6
+#define        C       r7
+#define        LDC     r8
+#define OFFSET r9
+#else
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r6
+#define OFFSET r7
+#endif
+#endif
+
+#define o0     0
+#define alpha_r vs30
+#define alpha_i vs31
+
+#define KKK    r13
+#define K1     r14
+#define L      r15
+#define ALPHA  r16
+#define o24    r17
+#define T2     r19
+#define KK     r20
+#define        o8      r21
+#define        I       r22
+#define J      r23
+#define AO     r24
+#define        BO      r25
+#define        CO      r26
+#define o16    r27
+#define        o32     r28
+#define o48    r29
+
+#define PRE    r30
+#define T1     r31
+
+#ifndef NEEDPARAM
+
+       PROLOGUE
+       PROFCODE
+
+       addi    SP, SP, -STACKSIZE
+       li      r0, 0
+
+       stfd    f14,    0(SP)
+       stfd    f15,    8(SP)
+       stfd    f16,   16(SP)
+       stfd    f17,   24(SP)
+
+       stfd    f18,   32(SP)
+       stfd    f19,   40(SP)
+       stfd    f20,   48(SP)
+       stfd    f21,   56(SP)
+
+       stfd    f22,   64(SP)
+       stfd    f23,   72(SP)
+       stfd    f24,   80(SP)
+       stfd    f25,   88(SP)
+
+       stfd    f26,   96(SP)
+       stfd    f27,  104(SP)
+       stfd    f28,  112(SP)
+       stfd    f29,  120(SP)
+
+       stfd    f30,  128(SP)
+       stfd    f31,  136(SP)
+
+#ifdef __64BIT__
+       std     r31,  144(SP)
+       std     r30,  152(SP)
+       std     r29,  160(SP)
+       std     r28,  168(SP)
+       std     r27,  176(SP)
+       std     r26,  184(SP)
+       std     r25,  192(SP)
+       std     r24,  200(SP)
+       std     r23,  208(SP)
+       std     r22,  216(SP)
+       std     r21,  224(SP)
+       std     r20,  232(SP)
+       std     r19,  240(SP)
+       std     r18,  248(SP)
+       std     r17,  256(SP)
+       std     r16,  264(SP)
+       std     r15,  272(SP)
+       std     r14,  280(SP)
+       std     r13,  288(SP)
+#else
+       stw     r31,  144(SP)
+       stw     r30,  148(SP)
+       stw     r29,  152(SP)
+       stw     r28,  156(SP)
+       stw     r27,  160(SP)
+       stw     r26,  164(SP)
+       stw     r25,  168(SP)
+       stw     r24,  172(SP)
+       stw     r23,  176(SP)
+       stw     r22,  180(SP)
+       stw     r21,  184(SP)
+       stw     r20,  188(SP)
+       stw     r19,  192(SP)
+       stw     r18,  196(SP)
+       stw     r17,  200(SP)
+       stw     r16,  204(SP)
+       stw     r15,  208(SP)
+       stw     r14,  212(SP)
+       stw     r13,  216(SP)
+#endif
+
+       stfd    f1,  ALPHA_R_SP
+       stfd    f2,  ALPHA_I_SP
+       stw     r0,  FZERO
+
+#ifdef linux
+#ifdef __64BIT__
+       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     B,   FRAMESLOT(0) + STACKSIZE(SP)
+       lwz     C,   FRAMESLOT(1) + STACKSIZE(SP)
+       lwz     LDC, FRAMESLOT(2) + STACKSIZE(SP)
+#else
+       lwz     LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     OFFSET,  FRAMESLOT(3) + STACKSIZE(SP)
+#else
+       lwz     OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+       neg     KK, OFFSET
+#endif
+#endif
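+
+// For the right-side TRMM cases (TRMMKERNEL without LEFT) KK starts at -OFFSET,
+// so the per-tile K ranges computed in ztrmm_logic_8x2_power8.S track the
+// triangular offset as the column blocks advance.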
+
+#include "zgemm_macros_8x2_power8.S"
+
+       cmpwi   cr0, M, 0
+       ble     L999
+       cmpwi   cr0, N, 0
+       ble     L999
+       cmpwi   cr0, K, 0
+       ble     L999
+
+       slwi    LDC, LDC, ZBASE_SHIFT
+       li      PRE, 256 
+       li      o8  , 8
+       li      o16 , 16
+       li      o24 , 24
+       li      o32 , 32
+       li      o48 , 48
+
+#ifdef __64BIT__
+       addi    ALPHA, SP, 296
+#else
+       addi    ALPHA, SP, 224
+#endif
+
+       lxsdx   alpha_r, 0, ALPHA
+       lxsdx   alpha_i, o8, ALPHA
+
+       .align 4
+
+#include "ztrmm_logic_8x2_power8.S"
+
+L999:
+       addi    r3, 0, 0
+
+       lfd     f14,    0(SP)
+       lfd     f15,    8(SP)
+       lfd     f16,   16(SP)
+       lfd     f17,   24(SP)
+
+       lfd     f18,   32(SP)
+       lfd     f19,   40(SP)
+       lfd     f20,   48(SP)
+       lfd     f21,   56(SP)
+
+       lfd     f22,   64(SP)
+       lfd     f23,   72(SP)
+       lfd     f24,   80(SP)
+       lfd     f25,   88(SP)
+
+       lfd     f26,   96(SP)
+       lfd     f27,  104(SP)
+       lfd     f28,  112(SP)
+       lfd     f29,  120(SP)
+
+       lfd     f30,  128(SP)
+       lfd     f31,  136(SP)
+
+#ifdef __64BIT__
+       ld      r31,  144(SP)
+       ld      r30,  152(SP)
+       ld      r29,  160(SP)
+       ld      r28,  168(SP)
+       ld      r27,  176(SP)
+       ld      r26,  184(SP)
+       ld      r25,  192(SP)
+       ld      r24,  200(SP)
+       ld      r23,  208(SP)
+       ld      r22,  216(SP)
+       ld      r21,  224(SP)
+       ld      r20,  232(SP)
+       ld      r19,  240(SP)
+       ld      r18,  248(SP)
+       ld      r17,  256(SP)
+       ld      r16,  264(SP)
+       ld      r15,  272(SP)
+       ld      r14,  280(SP)
+       ld      r13,  288(SP)
+#else
+       lwz     r31,  144(SP)
+       lwz     r30,  148(SP)
+       lwz     r29,  152(SP)
+       lwz     r28,  156(SP)
+       lwz     r27,  160(SP)
+       lwz     r26,  164(SP)
+       lwz     r25,  168(SP)
+       lwz     r24,  172(SP)
+       lwz     r23,  176(SP)
+       lwz     r22,  180(SP)
+       lwz     r21,  184(SP)
+       lwz     r20,  188(SP)
+       lwz     r19,  192(SP)
+       lwz     r18,  196(SP)
+       lwz     r17,  200(SP)
+       lwz     r16,  204(SP)
+       lwz     r15,  208(SP)
+       lwz     r14,  212(SP)
+       lwz     r13,  216(SP)
+#endif
+
+       addi    SP, SP, STACKSIZE
+
+       blr
+
+       EPILOGUE
+#endif
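The wrapper above follows the same structure as the new zgemm kernel: save the callee-saved FPRs/GPRs, pick up the arguments that do not arrive in registers, broadcast alpha, and hand control to the logic file. As a rough model only (the prototype mirrors the usual OpenBLAS TRMM kernel signature; the comments on argument placement are assumptions based on the FRAMESLOT loads above), it behaves like:

    /* Assumed structural sketch, not part of the patch. */
    int ztrmm_kernel_8x2_model(BLASLONG m, BLASLONG n, BLASLONG k,
                               double alpha_r, double alpha_i,
                               double *a, double *b, double *c,
                               BLASLONG ldc, BLASLONG offset)
    {
        if (m <= 0 || n <= 0 || k <= 0)   /* the cmpwi/ble L999 guards */
            return 0;

        /* ldc is pre-scaled to bytes (slwi LDC, LDC, ZBASE_SHIFT); ldc and
         * offset are fetched from the caller's parameter save area, hence the
         * FRAMESLOT(0)/FRAMESLOT(1) loads in the prologue. */

        /* ... 8x2 blocked loops from ztrmm_logic_8x2_power8.S run here ... */
        return 0;
    }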
diff --git a/kernel/power/ztrmm_logic_8x2_power8.S b/kernel/power/ztrmm_logic_8x2_power8.S
new file mode 100644 (file)
index 0000000..e250dfa
--- /dev/null
@@ -0,0 +1,1201 @@
+       srawi.          J,      N,      1
+       ble             ZTRMM_L2_END
+
+ZTRMM_L2_BEGIN:
+
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC,    1
+       add             C,      C,      T1
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      3
+       ble             ZTRMM_L2x8_END
+
+ZTRMM_L2x8_BEGIN:
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     7                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
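+// The shifts above convert KK from an element count to a byte offset:
+// B holds 2 double-complex values per K step (KK << 5 == KK * 32 bytes) and
+// A holds 8 double-complex values per K step (KK << 7 == KK * 128 bytes).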
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             ZTRMM_L2x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZTRMM_L2x8_SUB4
+
+ZTRMM_L2x8_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_I1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       addic.          L,      L,      -2
+       ble             ZTRMM_L2x8_LOOP_END
+
+       .align 5
+
+ZTRMM_L2x8_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L2x8_LOOP
+
+ZTRMM_L2x8_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       dcbt            AO,     PRE
+       KERNEL2x8_1
+       KERNEL2x8_E2
+
+       b               ZTRMM_L2x8_SUB1
+
+ZTRMM_L2x8_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL2x8_SUBI1
+       dcbt            AO,     PRE
+       KERNEL2x8_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x8_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x8_SUB1
+
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       b               ZTRMM_L2x8_SUB1
+
+ZTRMM_L2x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZTRMM_L2x8_SAVE
+       b               ZTRMM_L2x8_SUB2
+
+ZTRMM_L2x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             ZTRMM_L2x8_SAVE
+
+ZTRMM_L2x8_SUB2:
+
+       KERNEL2x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L2x8_SUB2
+
+ZTRMM_L2x8_SAVE:
+
+       SAVE2x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     7                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             ZTRMM_L2x8_BEGIN
+
+ZTRMM_L2x8_END:
+
+ZTRMM_L2x4_BEGIN:
+       andi.           T2,     M,      7
+       ble             ZTRMM_L2x1_END
+
+       andi.           T1,     M,      4
+       ble             ZTRMM_L2x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             ZTRMM_L2x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZTRMM_L2x4_SUB4
+
+ZTRMM_L2x4_LOOP_START:
+
+       LOAD2x4_1
+       KERNEL2x4_I1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -2
+       ble             ZTRMM_L2x4_LOOP_END
+
+       .align 5
+
+ZTRMM_L2x4_LOOP:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L2x4_LOOP
+
+ZTRMM_L2x4_LOOP_END:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_E2
+
+       b               ZTRMM_L2x4_SUB1
+
+ZTRMM_L2x4_SUB4:
+
+       KERNEL2x4_SUBI1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       b               ZTRMM_L2x4_SUB1
+
+ZTRMM_L2x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZTRMM_L2x4_SAVE
+       b               ZTRMM_L2x4_SUB2
+
+ZTRMM_L2x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             ZTRMM_L2x4_SAVE
+
+ZTRMM_L2x4_SUB2:
+
+       KERNEL2x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L2x4_SUB2
+
+ZTRMM_L2x4_SAVE:
+
+       SAVE2x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+ZTRMM_L2x4_END:
+
+ZTRMM_L2x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             ZTRMM_L2x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             ZTRMM_L2x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZTRMM_L2x2_SUB4
+
+ZTRMM_L2x2_LOOP_START:
+
+       LOAD2x2_1
+       KERNEL2x2_I1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -2
+       ble             ZTRMM_L2x2_LOOP_END
+
+       .align 5
+
+ZTRMM_L2x2_LOOP:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L2x2_LOOP
+
+ZTRMM_L2x2_LOOP_END:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_E2
+
+       b               ZTRMM_L2x2_SUB1
+
+ZTRMM_L2x2_SUB4:
+
+       KERNEL2x2_SUBI1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       b               ZTRMM_L2x2_SUB1
+
+ZTRMM_L2x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZTRMM_L2x2_SAVE
+       b               ZTRMM_L2x2_SUB2
+
+ZTRMM_L2x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             ZTRMM_L2x2_SAVE
+
+ZTRMM_L2x2_SUB2:
+
+       KERNEL2x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L2x2_SUB2
+
+ZTRMM_L2x2_SAVE:
+
+       SAVE2x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+ZTRMM_L2x2_END:
+
+ZTRMM_L2x1_BEGIN:
+
+       andi.           T1,     M,      1
+       ble             ZTRMM_L2x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             ZTRMM_L2x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZTRMM_L2x1_SUB4
+
+ZTRMM_L2x1_LOOP_START:
+
+       LOAD2x1_1
+       KERNEL2x1_I1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -2
+       ble             ZTRMM_L2x1_LOOP_END
+
+       .align 5
+
+ZTRMM_L2x1_LOOP:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L2x1_LOOP
+
+ZTRMM_L2x1_LOOP_END:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_E2
+
+       b               ZTRMM_L2x1_SUB1
+
+ZTRMM_L2x1_SUB4:
+
+       KERNEL2x1_SUBI1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       b               ZTRMM_L2x1_SUB1
+
+ZTRMM_L2x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZTRMM_L2x1_SAVE
+       b               ZTRMM_L2x1_SUB2
+
+ZTRMM_L2x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             ZTRMM_L2x1_SAVE
+
+ZTRMM_L2x1_SUB2:
+
+       KERNEL2x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L2x1_SUB2
+
+ZTRMM_L2x1_SAVE:
+
+       SAVE2x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+ZTRMM_L2x1_END:
+
+       slwi            T1,     K,      5
+       add             B,      B,      T1
+
+#if !defined(LEFT)
+       addi            KK,     KK,     2                                       // KK += Number of values in B
+#endif
+
+
+       addic.          J,      J,      -1
+       bgt             ZTRMM_L2_BEGIN
+
+       andi.           T2,     N,      1
+       ble             L999
+
+ZTRMM_L2_END:
+
+       b               ZTRMM_L1_BEGIN
+
+L999_H1:
+
+       b               L999
+
+ZTRMM_L1_BEGIN:
+
+       andi.           T1,     N,      1
+       ble             ZTRMM_L1_END
+       mr              CO,     C
+       mr              AO,     A
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      3
+       ble             ZTRMM_L1x8_END
+
+ZTRMM_L1x8_BEGIN:
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     7                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             ZTRMM_L1x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZTRMM_L1x8_SUB4
+
+ZTRMM_L1x8_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_I1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       addic.          L,      L,      -2
+       ble             ZTRMM_L1x8_LOOP_END
+
+       .align 5
+
+ZTRMM_L1x8_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L1x8_LOOP
+
+ZTRMM_L1x8_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       dcbt            AO,     PRE
+       KERNEL1x8_1
+       KERNEL1x8_E2
+
+       b               ZTRMM_L1x8_SUB1
+
+ZTRMM_L1x8_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL1x8_SUBI1
+       dcbt            AO,     PRE
+       KERNEL1x8_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x8_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x8_SUB1
+
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       b               ZTRMM_L1x8_SUB1
+
+ZTRMM_L1x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZTRMM_L1x8_SAVE
+       b               ZTRMM_L1x8_SUB2
+
+ZTRMM_L1x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             ZTRMM_L1x8_SAVE
+
+ZTRMM_L1x8_SUB2:
+
+       KERNEL1x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L1x8_SUB2
+
+ZTRMM_L1x8_SAVE:
+
+       SAVE1x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     7                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             ZTRMM_L1x8_BEGIN
+
+ZTRMM_L1x8_END:
+
+ZTRMM_L1x4_BEGIN:
+       andi.           T2,     M,      7
+       ble             ZTRMM_L1x1_END
+
+       andi.           T1,     M,      4
+       ble             ZTRMM_L1x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             ZTRMM_L1x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZTRMM_L1x4_SUB4
+
+ZTRMM_L1x4_LOOP_START:
+
+       LOAD1x4_1
+       KERNEL1x4_I1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -2
+       ble             ZTRMM_L1x4_LOOP_END
+
+       .align 5
+
+ZTRMM_L1x4_LOOP:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L1x4_LOOP
+
+ZTRMM_L1x4_LOOP_END:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_E2
+
+       b               ZTRMM_L1x4_SUB1
+
+ZTRMM_L1x4_SUB4:
+
+       KERNEL1x4_SUBI1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       b               ZTRMM_L1x4_SUB1
+
+ZTRMM_L1x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZTRMM_L1x4_SAVE
+       b               ZTRMM_L1x4_SUB2
+
+ZTRMM_L1x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             ZTRMM_L1x4_SAVE
+
+ZTRMM_L1x4_SUB2:
+
+       KERNEL1x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L1x4_SUB2
+
+ZTRMM_L1x4_SAVE:
+
+       SAVE1x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+ZTRMM_L1x4_END:
+
+ZTRMM_L1x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             ZTRMM_L1x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             ZTRMM_L1x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZTRMM_L1x2_SUB4
+
+ZTRMM_L1x2_LOOP_START:
+
+       LOAD1x2_1
+       KERNEL1x2_I1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -2
+       ble             ZTRMM_L1x2_LOOP_END
+
+       .align 5
+
+ZTRMM_L1x2_LOOP:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L1x2_LOOP
+
+ZTRMM_L1x2_LOOP_END:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_E2
+
+       b               ZTRMM_L1x2_SUB1
+
+ZTRMM_L1x2_SUB4:
+
+       KERNEL1x2_SUBI1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       b               ZTRMM_L1x2_SUB1
+
+ZTRMM_L1x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZTRMM_L1x2_SAVE
+       b               ZTRMM_L1x2_SUB2
+
+ZTRMM_L1x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             ZTRMM_L1x2_SAVE
+
+ZTRMM_L1x2_SUB2:
+
+       KERNEL1x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L1x2_SUB2
+
+ZTRMM_L1x2_SAVE:
+
+       SAVE1x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+ZTRMM_L1x2_END:
+
+ZTRMM_L1x1_BEGIN:
+
+       andi.           T1,     M,      1
+       ble             ZTRMM_L1x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             ZTRMM_L1x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             ZTRMM_L1x1_SUB4
+
+ZTRMM_L1x1_LOOP_START:
+
+       LOAD1x1_1
+       KERNEL1x1_I1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -2
+       ble             ZTRMM_L1x1_LOOP_END
+
+       .align 5
+
+ZTRMM_L1x1_LOOP:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L1x1_LOOP
+
+ZTRMM_L1x1_LOOP_END:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_E2
+
+       b               ZTRMM_L1x1_SUB1
+
+ZTRMM_L1x1_SUB4:
+
+       KERNEL1x1_SUBI1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       b               ZTRMM_L1x1_SUB1
+
+ZTRMM_L1x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             ZTRMM_L1x1_SAVE
+       b               ZTRMM_L1x1_SUB2
+
+ZTRMM_L1x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             ZTRMM_L1x1_SAVE
+
+ZTRMM_L1x1_SUB2:
+
+       KERNEL1x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             ZTRMM_L1x1_SUB2
+
+ZTRMM_L1x1_SAVE:
+
+       SAVE1x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+ZTRMM_L1x1_END:
+
+#if !defined(LEFT)
+       addi            KK,     KK,     1                                       // KK += Number of values in B
+#endif
+
+
+ZTRMM_L1_END:
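Every tile in the logic above repeats the same TRMM bookkeeping before and after its K loop. A compact C model of the per-tile setup (an assumed helper, not part of the patch; mr/nr are the tile sizes 8/4/2/1 and 2/1, and the pointers count doubles):

    /* Assumed sketch of the #if(LEFT/TRANSA) blocks that precede each tile. */
    static void ztrmm_tile_setup(long K, long KK, long mr, long nr,
                                 int left, int transa,
                                 const double *B, const double **BO,
                                 const double **AO,  /* already at the current row panel */
                                 long *KKK)
    {
        *BO = B;                             /* mr   BO, B */
        if (!((left && transa) || (!left && !transa))) {
            *BO += 2 * nr * KK;              /* slwi T1, KK, ...; add BO, BO, T1 */
            *AO += 2 * mr * KK;              /* slwi T2, KK, ...; add AO, AO, T2 */
        }
        if ((left && !transa) || (!left && transa))
            *KKK = K - KK;                   /* sub  T1, K, KK */
        else
            *KKK = KK + (left ? mr : nr);    /* addi T1, T1, mr (LEFT) or nr */
    }

After SAVE, the mirrored #if block (where enabled) advances AO/BO past the remaining K - KKK steps, and for LEFT builds KK is bumped by the tile height, which is exactly what the slwi/add/addi sequences following each SAVEmxn do.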
diff --git a/param.h b/param.h
index c46a1e9..e7dca2c 100644 (file)
--- a/param.h
+++ b/param.h
@@ -1962,35 +1962,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(POWER8)
 
 #define SNUMOPT                4
-#define DNUMOPT                4
+#define DNUMOPT                8
 
 #define GEMM_DEFAULT_OFFSET_A  384
 #define GEMM_DEFAULT_OFFSET_B 1024
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M 4
-#define SGEMM_DEFAULT_UNROLL_N 4
-#define DGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_M 2
+#define SGEMM_DEFAULT_UNROLL_N 2
+#define DGEMM_DEFAULT_UNROLL_M 16
 #define DGEMM_DEFAULT_UNROLL_N 4
 #define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 4
-#define ZGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_N 2
 
 #define SGEMM_DEFAULT_P  992
 #define DGEMM_DEFAULT_P  480
 #define CGEMM_DEFAULT_P  488
-#define ZGEMM_DEFAULT_P  248
+#define ZGEMM_DEFAULT_P  240
 
 #define SGEMM_DEFAULT_Q  504
-#define DGEMM_DEFAULT_Q  504
+#define DGEMM_DEFAULT_Q  720
 #define CGEMM_DEFAULT_Q  400
-#define ZGEMM_DEFAULT_Q  400
+#define ZGEMM_DEFAULT_Q  360
+
+#define DGEMM_DEFAULT_R 14400
+#define ZGEMM_DEFAULT_R 7200
 
 #define SYMV_P  8
 
 #endif
 
+
 #if defined(SPARC) && defined(V7)
 
 #define SNUMOPT                4
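For reference, the new param.h values line up with the register tiles of the kernels added in this commit (dgemm 16x4, zgemm/ztrmm 8x2), while P, Q and R control the outer cache blocking of M, K and N. A rough model of how the level-3 driver consumes them (loop order and names are illustrative, not lifted from the OpenBLAS driver):

    /* Assumed sketch: outer blocking by the *_DEFAULT_P/Q/R constants,
     * inner register tiling by *_DEFAULT_UNROLL_M/N. DGEMM values shown. */
    #define GEMM_P   480      /* DGEMM_DEFAULT_P: M blocking */
    #define GEMM_Q   720      /* DGEMM_DEFAULT_Q: K blocking */
    #define GEMM_R 14400      /* DGEMM_DEFAULT_R: N blocking */

    static void dgemm_blocking_model(long m, long n, long k)
    {
        for (long js = 0; js < n; js += GEMM_R)          /* N blocks */
            for (long ls = 0; ls < k; ls += GEMM_Q)      /* K blocks */
                for (long is = 0; is < m; is += GEMM_P)  /* M blocks */
                    ;   /* pack the A and B panels, then sweep them with the
                           16x4 register tile of dgemm_kernel_16x4_power8.S */
    }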