From: Werner Saar Date: Tue, 1 Mar 2016 06:33:56 +0000 (+0100) Subject: added dgemm-, dtrmm-, zgemm- and ztrmm-kernel for power8 X-Git-Tag: v0.2.16^2~1^2~15^2~8 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b752858d6c37c0aa393c4a0636d3cda2ff2da179;p=platform%2Fupstream%2Fopenblas.git added dgemm-, dtrmm-, zgemm- and ztrmm-kernel for power8 --- diff --git a/common_power.h b/common_power.h index ab331b0..64e052f 100644 --- a/common_power.h +++ b/common_power.h @@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif +#if defined(POWER8) +#define L1_DUALFETCH +#define L1_PREFETCHSIZE (16 + 128 * 100) +#define L1_PREFETCH dcbtst +#endif + +# #ifndef L1_PREFETCH #define L1_PREFETCH dcbt #endif diff --git a/cpuid_power.c b/cpuid_power.c index 6790076..951204a 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -66,7 +66,7 @@ char *cpuname[] = { "POWER6", "CELL", "PPCG4", - "POWER8", + "POWER8" }; char *lowercpuname[] = { @@ -78,7 +78,7 @@ char *lowercpuname[] = { "power6", "cell", "ppcg4", - "power8", + "power8" }; char *corename[] = { @@ -90,7 +90,7 @@ char *corename[] = { "POWER6", "CELL", "PPCG4", - "POWER8", + "POWER8" }; int detect(void){ diff --git a/getarch.c b/getarch.c index ff607a4..f9c49e6 100644 --- a/getarch.c +++ b/getarch.c @@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER5" #endif -#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8) +#if defined(FORCE_POWER6) || defined(FORCE_POWER7) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER6" @@ -565,7 +565,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER6" #endif -#if defined(FORCE_POWER8) +#if defined(FORCE_POWER8) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER8" @@ -578,6 +578,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "POWER8" #endif + #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 63e675b..8e68274 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif +ifeq ($(CORE), POWER8) +USE_TRMM = 1 +endif + + SKERNELOBJS += \ diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL index cb9ed84..eae60cd 100644 --- a/kernel/power/KERNEL +++ b/kernel/power/KERNEL @@ -1,57 +1,3 @@ -SGEMM_BETA = gemm_beta.S -DGEMM_BETA = gemm_beta.S -CGEMM_BETA = zgemm_beta.S -ZGEMM_BETA = zgemm_beta.S - - -ifndef SSYMV_U_KERNEL -SSYMV_U_KERNEL = symv_U.S -endif - -ifndef SSYMV_L_KERNEL -SSYMV_L_KERNEL = symv_L.S -endif - -ifndef DSYMV_U_KERNEL -DSYMV_U_KERNEL = symv_U.S -endif - -ifndef DSYMV_L_KERNEL -DSYMV_L_KERNEL = symv_L.S -endif - -ifndef CSYMV_U_KERNEL -CSYMV_U_KERNEL = zsymv_U.S -endif - -ifndef CSYMV_L_KERNEL -CSYMV_L_KERNEL = zsymv_L.S -endif - -ifndef ZSYMV_U_KERNEL -ZSYMV_U_KERNEL = zsymv_U.S -endif - -ifndef ZSYMV_L_KERNEL -ZSYMV_L_KERNEL = zsymv_L.S -endif - -ifndef CHEMV_U_KERNEL -CHEMV_U_KERNEL = zsymv_U.S -endif - -ifndef CHEMV_L_KERNEL -CHEMV_L_KERNEL = zsymv_L.S -endif - -ifndef ZHEMV_U_KERNEL -ZHEMV_U_KERNEL = zsymv_U.S -endif - -ifndef ZHEMV_L_KERNEL -ZHEMV_L_KERNEL = zsymv_L.S -endif - ifndef STRSMKERNEL_LN STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c endif diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 344b205..3a627e4 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,56 +1,173 @@ -SGEMMKERNEL = gemm_kernel_power6.S -SGEMMINCOPY = -SGEMMITCOPY = -SGEMMONCOPY = gemm_ncopy_4.S -SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = gemm_kernel_power6.S -DGEMMINCOPY = -DGEMMITCOPY = +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = dtrmm_kernel_16x4_power8.S +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_16x4_power8.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = ../generic/gemm_tcopy_16.c DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = zgemm_kernel_power6.S -CGEMMINCOPY = ../generic/zgemm_ncopy_2.c -CGEMMITCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_4.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_power6.S -ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = 
zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRSMKERNEL_LN = trsm_kernel_power6_LN.S -STRSMKERNEL_LT = trsm_kernel_power6_LT.S -STRSMKERNEL_RN = trsm_kernel_power6_LT.S -STRSMKERNEL_RT = trsm_kernel_power6_RT.S - -DTRSMKERNEL_LN = trsm_kernel_power6_LN.S -DTRSMKERNEL_LT = trsm_kernel_power6_LT.S -DTRSMKERNEL_RN = trsm_kernel_power6_LT.S -DTRSMKERNEL_RT = trsm_kernel_power6_RT.S - -CTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S -CTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S -CTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S -CTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S - -ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S -ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S -ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S -ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
+CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/power/def_vsx.h b/kernel/power/def_vsx.h new file mode 100644 index 0000000..c2d29e2 --- /dev/null +++ b/kernel/power/def_vsx.h @@ -0,0 +1,64 @@ +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#define vs14 14 +#define vs15 15 +#define vs16 16 +#define vs17 17 +#define vs18 18 +#define vs19 19 +#define vs20 20 +#define vs21 21 +#define vs22 22 +#define vs23 23 +#define vs24 24 +#define vs25 25 +#define vs26 26 +#define vs27 27 +#define vs28 28 +#define vs29 29 +#define vs30 30 +#define vs31 31 +#define vs32 32 
+#define vs33 33 +#define vs34 34 +#define vs35 35 +#define vs36 36 +#define vs37 37 +#define vs38 38 +#define vs39 39 +#define vs40 40 +#define vs41 41 +#define vs42 42 +#define vs43 43 +#define vs44 44 +#define vs45 45 +#define vs46 46 +#define vs47 47 +#define vs48 48 +#define vs49 49 +#define vs50 50 +#define vs51 51 +#define vs52 52 +#define vs53 53 +#define vs54 54 +#define vs55 55 +#define vs56 56 +#define vs57 57 +#define vs58 58 +#define vs59 59 +#define vs60 60 +#define vs61 61 +#define vs62 62 +#define vs63 63 diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S new file mode 100644 index 0000000..53205ad --- /dev/null +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -0,0 +1,313 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs18 + +#define o0 0 + +#define o8 r15 +#define o24 r16 +#define ALPHA r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_16x4_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) +#endif + + stfd f1, ALPHA_SP + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + li PRE, 256 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + lxvdsx alpha_r, 0, ALPHA + +#include "dgemm_logic_16x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + 
lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S new file mode 100644 index 0000000..e19f78b --- /dev/null +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -0,0 +1,1647 @@ + srawi. J, N, 2 + ble DGEMM_L4_END + +DGEMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 4 + ble DGEMM_L4x16_END + +DGEMM_L4x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x16_SUB4 + +DGEMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble DGEMM_L4x16_LOOP_END + + .align 5 + +DGEMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt DGEMM_L4x16_LOOP + +DGEMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b DGEMM_L4x16_SUB1 + +DGEMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b DGEMM_L4x16_SUB1 + +DGEMM_L4x16_SUB0: + + andi. L, K, 7 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x16_SAVE + b DGEMM_L4x16_SUB2 + +DGEMM_L4x16_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x16_SAVE + +DGEMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x16_SUB2 + +DGEMM_L4x16_SAVE: + + SAVE4x16 + + addic. I, I, -1 + bgt DGEMM_L4x16_BEGIN + +DGEMM_L4x16_END: + +DGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble DGEMM_L4x1_END + + andi. T1, M, 8 + ble DGEMM_L4x8_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x8_SUB4 + +DGEMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble DGEMM_L4x8_LOOP_END + + .align 5 + +DGEMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. 
L, L, -1 + bgt DGEMM_L4x8_LOOP + +DGEMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b DGEMM_L4x8_SUB1 + +DGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b DGEMM_L4x8_SUB1 + +DGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x8_SAVE + b DGEMM_L4x8_SUB2 + +DGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x8_SAVE + +DGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x8_SUB2 + +DGEMM_L4x8_SAVE: + + SAVE4x8 + +DGEMM_L4x8_END: + +DGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble DGEMM_L4x4_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x4_SUB4 + +DGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble DGEMM_L4x4_LOOP_END + + .align 5 + +DGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt DGEMM_L4x4_LOOP + +DGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b DGEMM_L4x4_SUB1 + +DGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b DGEMM_L4x4_SUB1 + +DGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x4_SAVE + b DGEMM_L4x4_SUB2 + +DGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x4_SAVE + +DGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x4_SUB2 + +DGEMM_L4x4_SAVE: + + SAVE4x4 + +DGEMM_L4x4_END: + +DGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble DGEMM_L4x2_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x2_SUB4 + +DGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble DGEMM_L4x2_LOOP_END + + .align 5 + +DGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt DGEMM_L4x2_LOOP + +DGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b DGEMM_L4x2_SUB1 + +DGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b DGEMM_L4x2_SUB1 + +DGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x2_SAVE + b DGEMM_L4x2_SUB2 + +DGEMM_L4x2_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x2_SAVE + +DGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x2_SUB2 + +DGEMM_L4x2_SAVE: + + SAVE4x2 + +DGEMM_L4x2_END: + +DGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble DGEMM_L4x1_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L4x1_SUB4 + +DGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. 
L, L, -2 + ble DGEMM_L4x1_LOOP_END + + .align 5 + +DGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt DGEMM_L4x1_LOOP + +DGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b DGEMM_L4x1_SUB1 + +DGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b DGEMM_L4x1_SUB1 + +DGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble DGEMM_L4x1_SAVE + b DGEMM_L4x1_SUB2 + +DGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble DGEMM_L4x1_SAVE + +DGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt DGEMM_L4x1_SUB2 + +DGEMM_L4x1_SAVE: + + SAVE4x1 + +DGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt DGEMM_L4_BEGIN + + andi. T2, N, 3 + ble L999 + +DGEMM_L4_END: + + b DGEMM_L2_BEGIN + +L999_H1: + + b L999 + +DGEMM_L2_BEGIN: + + andi. T1, N, 2 + ble DGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble DGEMM_L2x16_END + +DGEMM_L2x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x16_SUB4 + +DGEMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble DGEMM_L2x16_LOOP_END + + .align 5 + +DGEMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt DGEMM_L2x16_LOOP + +DGEMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b DGEMM_L2x16_SUB1 + +DGEMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b DGEMM_L2x16_SUB1 + +DGEMM_L2x16_SUB0: + + andi. L, K, 7 + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x16_SAVE + b DGEMM_L2x16_SUB2 + +DGEMM_L2x16_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x16_SAVE + +DGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x16_SUB2 + +DGEMM_L2x16_SAVE: + + SAVE2x16 + + addic. I, I, -1 + bgt DGEMM_L2x16_BEGIN + +DGEMM_L2x16_END: + +DGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble DGEMM_L2x1_END + + andi. T1, M, 8 + ble DGEMM_L2x8_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x8_SUB4 + +DGEMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble DGEMM_L2x8_LOOP_END + + .align 5 + +DGEMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. 
L, L, -1 + bgt DGEMM_L2x8_LOOP + +DGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b DGEMM_L2x8_SUB1 + +DGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b DGEMM_L2x8_SUB1 + +DGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x8_SAVE + b DGEMM_L2x8_SUB2 + +DGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x8_SAVE + +DGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x8_SUB2 + +DGEMM_L2x8_SAVE: + + SAVE2x8 + +DGEMM_L2x8_END: + +DGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble DGEMM_L2x4_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x4_SUB4 + +DGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble DGEMM_L2x4_LOOP_END + + .align 5 + +DGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt DGEMM_L2x4_LOOP + +DGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b DGEMM_L2x4_SUB1 + +DGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b DGEMM_L2x4_SUB1 + +DGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x4_SAVE + b DGEMM_L2x4_SUB2 + +DGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x4_SAVE + +DGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x4_SUB2 + +DGEMM_L2x4_SAVE: + + SAVE2x4 + +DGEMM_L2x4_END: + +DGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble DGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x2_SUB4 + +DGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble DGEMM_L2x2_LOOP_END + + .align 5 + +DGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt DGEMM_L2x2_LOOP + +DGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b DGEMM_L2x2_SUB1 + +DGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b DGEMM_L2x2_SUB1 + +DGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x2_SAVE + b DGEMM_L2x2_SUB2 + +DGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x2_SAVE + +DGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x2_SUB2 + +DGEMM_L2x2_SAVE: + + SAVE2x2 + +DGEMM_L2x2_END: + +DGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble DGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L2x1_SUB4 + +DGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. 
L, L, -2 + ble DGEMM_L2x1_LOOP_END + + .align 5 + +DGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt DGEMM_L2x1_LOOP + +DGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b DGEMM_L2x1_SUB1 + +DGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b DGEMM_L2x1_SUB1 + +DGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble DGEMM_L2x1_SAVE + b DGEMM_L2x1_SUB2 + +DGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble DGEMM_L2x1_SAVE + +DGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt DGEMM_L2x1_SUB2 + +DGEMM_L2x1_SAVE: + + SAVE2x1 + +DGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +DGEMM_L2_END: +DGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble DGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble DGEMM_L1x16_END + +DGEMM_L1x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x16_SUB4 + +DGEMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble DGEMM_L1x16_LOOP_END + + .align 5 + +DGEMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt DGEMM_L1x16_LOOP + +DGEMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b DGEMM_L1x16_SUB1 + +DGEMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b DGEMM_L1x16_SUB1 + +DGEMM_L1x16_SUB0: + + andi. L, K, 7 + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x16_SAVE + b DGEMM_L1x16_SUB2 + +DGEMM_L1x16_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x16_SAVE + +DGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x16_SUB2 + +DGEMM_L1x16_SAVE: + + SAVE1x16 + + addic. I, I, -1 + bgt DGEMM_L1x16_BEGIN + +DGEMM_L1x16_END: + +DGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble DGEMM_L1x1_END + + andi. T1, M, 8 + ble DGEMM_L1x8_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x8_SUB4 + +DGEMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble DGEMM_L1x8_LOOP_END + + .align 5 + +DGEMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. 
L, L, -1 + bgt DGEMM_L1x8_LOOP + +DGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b DGEMM_L1x8_SUB1 + +DGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b DGEMM_L1x8_SUB1 + +DGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x8_SAVE + b DGEMM_L1x8_SUB2 + +DGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x8_SAVE + +DGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x8_SUB2 + +DGEMM_L1x8_SAVE: + + SAVE1x8 + +DGEMM_L1x8_END: + +DGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble DGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x4_SUB4 + +DGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble DGEMM_L1x4_LOOP_END + + .align 5 + +DGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt DGEMM_L1x4_LOOP + +DGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b DGEMM_L1x4_SUB1 + +DGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b DGEMM_L1x4_SUB1 + +DGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x4_SAVE + b DGEMM_L1x4_SUB2 + +DGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x4_SAVE + +DGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x4_SUB2 + +DGEMM_L1x4_SAVE: + + SAVE1x4 + +DGEMM_L1x4_END: + +DGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble DGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x2_SUB4 + +DGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble DGEMM_L1x2_LOOP_END + + .align 5 + +DGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt DGEMM_L1x2_LOOP + +DGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b DGEMM_L1x2_SUB1 + +DGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b DGEMM_L1x2_SUB1 + +DGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x2_SAVE + b DGEMM_L1x2_SUB2 + +DGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x2_SAVE + +DGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x2_SUB2 + +DGEMM_L1x2_SAVE: + + SAVE1x2 + +DGEMM_L1x2_END: + +DGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble DGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble DGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble DGEMM_L1x1_SUB4 + +DGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. 
L, L, -2 + ble DGEMM_L1x1_LOOP_END + + .align 5 + +DGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt DGEMM_L1x1_LOOP + +DGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b DGEMM_L1x1_SUB1 + +DGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b DGEMM_L1x1_SUB1 + +DGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble DGEMM_L1x1_SAVE + b DGEMM_L1x1_SUB2 + +DGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble DGEMM_L1x1_SAVE + +DGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt DGEMM_L1x1_SUB2 + +DGEMM_L1x1_SAVE: + + SAVE1x1 + +DGEMM_L1x1_END: + +DGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S new file mode 100644 index 0000000..d409098 --- /dev/null +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -0,0 +1,3400 @@ +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ + +.macro LOAD4x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, 
vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + +.endm + +.macro KERNEL4x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + 
xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.endm + +.macro KERNEL4x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endm + +.macro SAVE4x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + dcbt T1, PRE + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x 
vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + dcbt T1, PRE + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r + xvmaddadp vs4, vs52, alpha_r + xvmaddadp vs5, vs53, alpha_r + xvmaddadp vs6, vs54, alpha_r + xvmaddadp vs7, vs55, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r + xvmuldp vs4, vs52, alpha_r + xvmuldp vs5, vs53, alpha_r + xvmuldp vs6, vs54, alpha_r + xvmuldp vs7, vs55, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + dcbt T1, PRE + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r + xvmaddadp vs12, vs60, alpha_r + xvmaddadp vs13, vs61, alpha_r + xvmaddadp vs14, vs62, alpha_r + xvmaddadp vs15, vs63, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r + xvmuldp vs12, vs60, alpha_r + xvmuldp vs13, vs61, alpha_r + xvmuldp vs14, vs62, alpha_r + xvmuldp vs15, vs63, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + dcbt T1, PRE + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + 
lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + +.endm + +.macro KERNEL4x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.endm + +.macro KERNEL4x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC 
+ +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + 
lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, 
o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + 
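+	// scalar GEMM path (M=1 tail): xsmaddadp folds alpha*vs40 into the C element loaded just above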
xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, 
vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + 
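+	// T2 runs 64 bytes (8 doubles) ahead of T1, so these stores cover elements 8..15 of the 16-wide row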
stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 
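+	// row 2 of this N=2, M=8 tile is read back only in the GEMM build; the TRMM build overwrites C without reading it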
+#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + 
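+	// the FMAs still consume the previously loaded vs0/vs24-vs25 while vs8 and vs28-vs29 for the
+	// next step were fetched above: the same two-stage software pipelining used in the larger kernels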
xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, 
BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 
0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, 
vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S new file mode 100644 index 0000000..c892c65 --- /dev/null +++ 
b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -0,0 +1,327 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs18 + +#define o0 0 + +#define K1 r13 +#define KKK r14 +#define o8 r15 +#define o24 r16 +#define ALPHA r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_16x4_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfd f1, ALPHA_SP + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + mr KK, OFFSET +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, KK +#endif + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + li PRE, 256 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + lxvdsx alpha_r, 0, ALPHA + +#include "dtrmm_logic_16x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 
8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/dtrmm_logic_16x4_power8.S b/kernel/power/dtrmm_logic_16x4_power8.S new file mode 100644 index 0000000..f2886f8 --- /dev/null +++ b/kernel/power/dtrmm_logic_16x4_power8.S @@ -0,0 +1,2202 @@ + srawi. J, N, 2 + ble DTRMM_L4_END + +DTRMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble DTRMM_L4x16_END + +DTRMM_L4x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x16_SUB4 + +DTRMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble DTRMM_L4x16_LOOP_END + + .align 5 + +DTRMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt DTRMM_L4x16_LOOP + +DTRMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b DTRMM_L4x16_SUB1 + +DTRMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b DTRMM_L4x16_SUB1 + +DTRMM_L4x16_SUB0: + + andi. 
L, K1, 7 // K1 & 7 -> L + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x16_SAVE + b DTRMM_L4x16_SUB2 + +DTRMM_L4x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x16_SAVE + +DTRMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x16_SUB2 + +DTRMM_L4x16_SAVE: + + SAVE4x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt DTRMM_L4x16_BEGIN + +DTRMM_L4x16_END: + +DTRMM_L4x8_BEGIN: + andi. T2, M, 15 + ble DTRMM_L4x1_END + + andi. T1, M, 8 + ble DTRMM_L4x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x8_SUB4 + +DTRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble DTRMM_L4x8_LOOP_END + + .align 5 + +DTRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt DTRMM_L4x8_LOOP + +DTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b DTRMM_L4x8_SUB1 + +DTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b DTRMM_L4x8_SUB1 + +DTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x8_SAVE + b DTRMM_L4x8_SUB2 + +DTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x8_SAVE + +DTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x8_SUB2 + +DTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +DTRMM_L4x8_END: + +DTRMM_L4x4_BEGIN: + + andi. 
T1, M, 4 + ble DTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x4_SUB4 + +DTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble DTRMM_L4x4_LOOP_END + + .align 5 + +DTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt DTRMM_L4x4_LOOP + +DTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b DTRMM_L4x4_SUB1 + +DTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b DTRMM_L4x4_SUB1 + +DTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x4_SAVE + b DTRMM_L4x4_SUB2 + +DTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x4_SAVE + +DTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x4_SUB2 + +DTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +DTRMM_L4x4_END: + +DTRMM_L4x2_BEGIN: + + andi. T1, M, 2 + ble DTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x2_SUB4 + +DTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble DTRMM_L4x2_LOOP_END + + .align 5 + +DTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. 
L, L, -1 + bgt DTRMM_L4x2_LOOP + +DTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b DTRMM_L4x2_SUB1 + +DTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b DTRMM_L4x2_SUB1 + +DTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x2_SAVE + b DTRMM_L4x2_SUB2 + +DTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x2_SAVE + +DTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt DTRMM_L4x2_SUB2 + +DTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +DTRMM_L4x2_END: + +DTRMM_L4x1_BEGIN: + + andi. T1, M, 1 + ble DTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L4x1_SUB4 + +DTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble DTRMM_L4x1_LOOP_END + + .align 5 + +DTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt DTRMM_L4x1_LOOP + +DTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b DTRMM_L4x1_SUB1 + +DTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b DTRMM_L4x1_SUB1 + +DTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble DTRMM_L4x1_SAVE + b DTRMM_L4x1_SUB2 + +DTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L4x1_SAVE + +DTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L4x1_SUB2 + +DTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +DTRMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt DTRMM_L4_BEGIN + + andi. T2, N, 3 + ble L999 + +DTRMM_L4_END: + + b DTRMM_L2_BEGIN + +L999_H1: + + b L999 + +DTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble DTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble DTRMM_L2x16_END + +DTRMM_L2x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x16_SUB4 + +DTRMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble DTRMM_L2x16_LOOP_END + + .align 5 + +DTRMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt DTRMM_L2x16_LOOP + +DTRMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b DTRMM_L2x16_SUB1 + +DTRMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b DTRMM_L2x16_SUB1 + +DTRMM_L2x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x16_SAVE + b DTRMM_L2x16_SUB2 + +DTRMM_L2x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x16_SAVE + +DTRMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L2x16_SUB2 + +DTRMM_L2x16_SAVE: + + SAVE2x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt DTRMM_L2x16_BEGIN + +DTRMM_L2x16_END: + +DTRMM_L2x8_BEGIN: + andi. T2, M, 15 + ble DTRMM_L2x1_END + + andi. T1, M, 8 + ble DTRMM_L2x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x8_SUB4 + +DTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble DTRMM_L2x8_LOOP_END + + .align 5 + +DTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt DTRMM_L2x8_LOOP + +DTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b DTRMM_L2x8_SUB1 + +DTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b DTRMM_L2x8_SUB1 + +DTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x8_SAVE + b DTRMM_L2x8_SUB2 + +DTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x8_SAVE + +DTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt DTRMM_L2x8_SUB2 + +DTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +DTRMM_L2x8_END: + +DTRMM_L2x4_BEGIN: + + andi. 
T1, M, 4 + ble DTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x4_SUB4 + +DTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble DTRMM_L2x4_LOOP_END + + .align 5 + +DTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt DTRMM_L2x4_LOOP + +DTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b DTRMM_L2x4_SUB1 + +DTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b DTRMM_L2x4_SUB1 + +DTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x4_SAVE + b DTRMM_L2x4_SUB2 + +DTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x4_SAVE + +DTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt DTRMM_L2x4_SUB2 + +DTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +DTRMM_L2x4_END: + +DTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble DTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x2_SUB4 + +DTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble DTRMM_L2x2_LOOP_END + + .align 5 + +DTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. 
L, L, -1 + bgt DTRMM_L2x2_LOOP + +DTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b DTRMM_L2x2_SUB1 + +DTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b DTRMM_L2x2_SUB1 + +DTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x2_SAVE + b DTRMM_L2x2_SUB2 + +DTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x2_SAVE + +DTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt DTRMM_L2x2_SUB2 + +DTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +DTRMM_L2x2_END: + +DTRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble DTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L2x1_SUB4 + +DTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble DTRMM_L2x1_LOOP_END + + .align 5 + +DTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt DTRMM_L2x1_LOOP + +DTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b DTRMM_L2x1_SUB1 + +DTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b DTRMM_L2x1_SUB1 + +DTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble DTRMM_L2x1_SAVE + b DTRMM_L2x1_SUB2 + +DTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L2x1_SAVE + +DTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L2x1_SUB2 + +DTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +DTRMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +DTRMM_L2_END: +DTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble DTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble DTRMM_L1x16_END + +DTRMM_L1x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x16_SUB4 + +DTRMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble DTRMM_L1x16_LOOP_END + + .align 5 + +DTRMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt DTRMM_L1x16_LOOP + +DTRMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b DTRMM_L1x16_SUB1 + +DTRMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b DTRMM_L1x16_SUB1 + +DTRMM_L1x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x16_SAVE + b DTRMM_L1x16_SUB2 + +DTRMM_L1x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x16_SAVE + +DTRMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L1x16_SUB2 + +DTRMM_L1x16_SAVE: + + SAVE1x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt DTRMM_L1x16_BEGIN + +DTRMM_L1x16_END: + +DTRMM_L1x8_BEGIN: + andi. T2, M, 15 + ble DTRMM_L1x1_END + + andi. T1, M, 8 + ble DTRMM_L1x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x8_SUB4 + +DTRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble DTRMM_L1x8_LOOP_END + + .align 5 + +DTRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt DTRMM_L1x8_LOOP + +DTRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b DTRMM_L1x8_SUB1 + +DTRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b DTRMM_L1x8_SUB1 + +DTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x8_SAVE + b DTRMM_L1x8_SUB2 + +DTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x8_SAVE + +DTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt DTRMM_L1x8_SUB2 + +DTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +DTRMM_L1x8_END: + +DTRMM_L1x4_BEGIN: + + andi. 
T1, M, 4 + ble DTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x4_SUB4 + +DTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble DTRMM_L1x4_LOOP_END + + .align 5 + +DTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt DTRMM_L1x4_LOOP + +DTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b DTRMM_L1x4_SUB1 + +DTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b DTRMM_L1x4_SUB1 + +DTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x4_SAVE + b DTRMM_L1x4_SUB2 + +DTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x4_SAVE + +DTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt DTRMM_L1x4_SUB2 + +DTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +DTRMM_L1x4_END: + +DTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble DTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x2_SUB4 + +DTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble DTRMM_L1x2_LOOP_END + + .align 5 + +DTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -1 + bgt DTRMM_L1x2_LOOP + +DTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b DTRMM_L1x2_SUB1 + +DTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b DTRMM_L1x2_SUB1 + +DTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x2_SAVE + b DTRMM_L1x2_SUB2 + +DTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x2_SAVE + +DTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt DTRMM_L1x2_SUB2 + +DTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +DTRMM_L1x2_END: + +DTRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble DTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble DTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble DTRMM_L1x1_SUB4 + +DTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble DTRMM_L1x1_LOOP_END + + .align 5 + +DTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt DTRMM_L1x1_LOOP + +DTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b DTRMM_L1x1_SUB1 + +DTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b DTRMM_L1x1_SUB1 + +DTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble DTRMM_L1x1_SAVE + b DTRMM_L1x1_SUB2 + +DTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble DTRMM_L1x1_SAVE + +DTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt DTRMM_L1x1_SUB2 + +DTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +DTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +DTRMM_L1_END: diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S index d7cfe5e..c6e69b4 100644 --- a/kernel/power/gemm_ncopy_4.S +++ b/kernel/power/gemm_ncopy_4.S @@ -104,12 +104,12 @@ #define PREFETCHWSIZE 72 #endif -#ifdef POWER8 +#ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif -#ifdef PPCG4 +#ifdef POWER8 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif @@ -198,7 +198,7 @@ LL(12): STFD c12, 14 * SIZE(B) STFD c16, 15 * SIZE(B) -#ifdef POWER6 +#if defined(POWER6) || defined(POWER8) dcbtst PREA, AO1 dcbtst PREA, AO2 dcbtst PREA, AO3 diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S index 46b1cd9..3051344 100644 --- a/kernel/power/gemm_tcopy_4.S +++ b/kernel/power/gemm_tcopy_4.S @@ -108,12 +108,12 @@ #define PREFETCHWSIZE 48 #endif -#ifdef POWER8 +#ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif -#ifdef PPCG4 +#ifdef POWER8 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif @@ -229,7 +229,7 @@ LL(12): STFD c15, 14 * SIZE(B1) STFD c16, 15 * SIZE(B1) -#ifdef POWER6 +#if defined(POWER6) || defined(POWER8) dcbtst PREA, AO1 dcbtst PREA, AO2 dcbtst PREA, AO3 diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 5c46c43..77587ec 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -174,11 +174,6 @@ #define PREFETCHSIZE_C 40 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 96 -#define PREFETCHSIZE_C 40 -#endif - #ifndef NEEDPARAM #ifndef __64BIT__ diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 4577530..817a60b 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -139,11 +139,6 @@ #define PREFETCHSIZE_C 8 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 96 -#define PREFETCHSIZE_C 8 -#endif - #define y01 f0 #define y02 f1 #define y03 f2 diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S index 9f759c3..f7d768c 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -168,11 +168,7 @@ #define PREFETCHSIZE_A 40 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 40 -#endif - -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index e4e419b..d8e0823 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -167,11 +167,7 @@ #define PREFETCHSIZE_A 40 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 40 -#endif - -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S new file mode 100644 index 0000000..03957f4 --- /dev/null +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -0,0 
+1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define L r15 +#define ALPHA r16 +#define o24 r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) +#endif + + stfd f1, ALPHA_R_SP + stfd f2, ALPHA_I_SP + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "zgemm_macros_8x2_power8.S" + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o8 , 8 + li o16 , 16 + li o24 , 24 + li o32 , 32 + li o48 , 48 + 
+#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + lxvdsx alpha_r, 0, ALPHA + lxvdsx alpha_i, o8, ALPHA + + .align 5 + +#include "zgemm_logic_8x2_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S new file mode 100644 index 0000000..e829fd6 --- /dev/null +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -0,0 +1,901 @@ + srawi. J, N, 1 + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble ZGEMM_L2x8_END + +ZGEMM_L2x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x8_SUB4 + +ZGEMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + dcbt AO, PRE + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble ZGEMM_L2x8_LOOP_END + + .align 5 + +ZGEMM_L2x8_LOOP: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt ZGEMM_L2x8_LOOP + +ZGEMM_L2x8_LOOP_END: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + KERNEL2x8_E2 + + b ZGEMM_L2x8_SUB1 + +ZGEMM_L2x8_SUB4: + + dcbt AO, PRE + KERNEL2x8_SUBI1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b ZGEMM_L2x8_SUB1 + +ZGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_END: + +ZGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + mr BO, B + srawi. 
L, K, 3 + ble ZGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x4_SUB4 + +ZGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble ZGEMM_L2x4_LOOP_END + + .align 5 + +ZGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt ZGEMM_L2x4_LOOP + +ZGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b ZGEMM_L2x4_SUB1 + +ZGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b ZGEMM_L2x4_SUB1 + +ZGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + +ZGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x4_SAVE + +ZGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L2x4_SUB2 + +ZGEMM_L2x4_SAVE: + + SAVE2x4 + +ZGEMM_L2x4_END: + +ZGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x2_SUB4 + +ZGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble ZGEMM_L2x2_LOOP_END + + .align 5 + +ZGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt ZGEMM_L2x2_LOOP + +ZGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b ZGEMM_L2x2_SUB1 + +ZGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b ZGEMM_L2x2_SUB1 + +ZGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + +ZGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x2_SAVE + +ZGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L2x2_SUB2 + +ZGEMM_L2x2_SAVE: + + SAVE2x2 + +ZGEMM_L2x2_END: + +ZGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L2x1_SUB4 + +ZGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble ZGEMM_L2x1_LOOP_END + + .align 5 + +ZGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt ZGEMM_L2x1_LOOP + +ZGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b ZGEMM_L2x1_SUB1 + +ZGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b ZGEMM_L2x1_SUB1 + +ZGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + +ZGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble ZGEMM_L2x1_SAVE + +ZGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. 
L, L, -1 + bgt ZGEMM_L2x1_SUB2 + +ZGEMM_L2x1_SAVE: + + SAVE2x1 + +ZGEMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt ZGEMM_L2_BEGIN + + andi. T2, N, 1 + ble L999 + +ZGEMM_L2_END: + + b ZGEMM_L1_BEGIN + +L999_H1: + + b L999 + +ZGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble ZGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 3 + ble ZGEMM_L1x8_END + +ZGEMM_L1x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x8_SUB4 + +ZGEMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + dcbt AO, PRE + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble ZGEMM_L1x8_LOOP_END + + .align 5 + +ZGEMM_L1x8_LOOP: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt ZGEMM_L1x8_LOOP + +ZGEMM_L1x8_LOOP_END: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + KERNEL1x8_E2 + + b ZGEMM_L1x8_SUB1 + +ZGEMM_L1x8_SUB4: + + dcbt AO, PRE + KERNEL1x8_SUBI1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b ZGEMM_L1x8_SUB1 + +ZGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + +ZGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x8_SAVE + +ZGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L1x8_SUB2 + +ZGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt ZGEMM_L1x8_BEGIN + +ZGEMM_L1x8_END: + +ZGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L1x1_END + + andi. T1, M, 4 + ble ZGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x4_SUB4 + +ZGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble ZGEMM_L1x4_LOOP_END + + .align 5 + +ZGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt ZGEMM_L1x4_LOOP + +ZGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b ZGEMM_L1x4_SUB1 + +ZGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b ZGEMM_L1x4_SUB1 + +ZGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + +ZGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x4_SAVE + +ZGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L1x4_SUB2 + +ZGEMM_L1x4_SAVE: + + SAVE1x4 + +ZGEMM_L1x4_END: + +ZGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x2_SUB4 + +ZGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -2 + ble ZGEMM_L1x2_LOOP_END + + .align 5 + +ZGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt ZGEMM_L1x2_LOOP + +ZGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b ZGEMM_L1x2_SUB1 + +ZGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b ZGEMM_L1x2_SUB1 + +ZGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + +ZGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x2_SAVE + +ZGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt ZGEMM_L1x2_SUB2 + +ZGEMM_L1x2_SAVE: + + SAVE1x2 + +ZGEMM_L1x2_END: + +ZGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble ZGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble ZGEMM_L1x1_SUB4 + +ZGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble ZGEMM_L1x1_LOOP_END + + .align 5 + +ZGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt ZGEMM_L1x1_LOOP + +ZGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b ZGEMM_L1x1_SUB1 + +ZGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b ZGEMM_L1x1_SUB1 + +ZGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + +ZGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble ZGEMM_L1x1_SAVE + +ZGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt ZGEMM_L1x1_SUB2 + +ZGEMM_L1x1_SAVE: + + SAVE1x1 + +ZGEMM_L1x1_END: + +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S new file mode 100644 index 0000000..3e5ea9c --- /dev/null +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -0,0 +1,3074 @@ +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + 
xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_1 + + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + lxvd2x vs0, o0, AO // load real,imag from 
A + lxvd2x vs1, o16, AO // load real,imag from A + + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, o0, 
AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, 
imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // 
real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, 
vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs48 // realA*realB + XSFADD_R2 vs0, vs0, vs49 // imagA*imagB + + xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs48 // realA*imagB + XSFADD_I2 vs1, vs1, vs49 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs50 // realA*realB + XSFADD_R2 vs0, vs0, vs51 // imagA*imagB + + xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs50 // realA*imagB + XSFADD_I2 vs1, vs1, vs51 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs52 // realA*realB + XSFADD_R2 vs0, vs0, vs53 // imagA*imagB + + xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs52 // realA*imagB + XSFADD_I2 vs1, vs1, vs53 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp 
vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs54 // realA*realB + XSFADD_R2 vs0, vs0, vs55 // imagA*imagB + + xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs54 // realA*imagB + XSFADD_I2 vs1, vs1, vs55 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs56 // realA*realB + XSFADD_R2 vs0, vs0, vs57 // imagA*imagB + + xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs56 // realA*imagB + XSFADD_I2 vs1, vs1, vs57 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs58 // realA*realB + XSFADD_R2 vs0, vs0, vs59 // imagA*imagB + + xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs58 // realA*imagB + XSFADD_I2 vs1, vs1, vs59 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs60 // realA*realB + XSFADD_R2 vs0, vs0, vs61 // imagA*imagB + + xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs60 // realA*imagB + XSFADD_I2 vs1, vs1, vs61 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs62 // realA*realB + 
XSFADD_R2 vs0, vs0, vs63 // imagA*imagB + + xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs62 // realA*imagB + XSFADD_I2 vs1, vs1, vs63 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // 
real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp 
vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + 
xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + 
xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, o0, AO // load real,imag 
from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x2 + + + mr T1, CO + +#ifndef 
TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, 
vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL2x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x1 + 
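+// Note: a sketch of what this store/scale epilogue computes, inferred from the
+// inline comments; XSFADD_R1/R2 and XSFADD_I1/I2 are assumed to be add/sub helper
+// macros defined earlier in this file that select the sign for the conjugation
+// variants.  Each accumulator pair (vs32/vs33 for the first column of C,
+// vs34/vs35 for the second) holds
+//   vs32 = (realA*realB, imagA*realB)    vs33 = (realA*imagB, imagA*imagB)
+// which the code below folds into one complex value per C element,
+//   re = realA*realB -/+ imagA*imagB     (XSFADD_R1 / XSFADD_R2)
+//   im = realA*imagB +/- imagA*realB     (XSFADD_I1 / XSFADD_I2)
+// then scales by alpha = (alpha_r, alpha_i),
+//   C_re = re*alpha_r - im*alpha_i
+//   C_im = re*alpha_i + im*alpha_r
+// and, unless TRMMKERNEL is defined, adds the C values previously loaded from
+// memory before storing the merged real/imag pair back with stxvd2x.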
+ + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // 
real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, 
vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x8 + + + mr T1, CO + addi T2, 
T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, 
realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, 
LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // 
load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, 
vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 
// real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL1x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from 
B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index f934399..23e0177 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -170,11 +170,6 @@ #define PREFETCHSIZE_C 24 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 24 -#define PREFETCHSIZE_C 24 -#endif - #ifndef XCONJ #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index 2b45014..c0bad31 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -144,11 +144,6 @@ #define PREFETCHSIZE_C 8 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 24 -#define PREFETCHSIZE_C 8 -#endif - #if !(defined(CONJ) && defined(XCONJ)) #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index 394c030..b348e32 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -169,11 +169,7 @@ #define PREFETCHSIZE_A 112 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 112 -#endif - 
-#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index a061cd7..b631cbe 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -166,11 +166,7 @@ #define PREFETCHSIZE_A 112 #endif -#ifdef POWER8 -#define PREFETCHSIZE_A 112 -#endif - -#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8) +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S new file mode 100644 index 0000000..dbbc8f9 --- /dev/null +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -0,0 +1,342 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define KKK r13 +#define K1 r14 +#define L r15 +#define ALPHA r16 +#define o24 r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfd f1, ALPHA_R_SP + stfd f2, ALPHA_I_SP + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "zgemm_macros_8x2_power8.S" + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + 
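+	// LDC arrives as a column stride in elements; shifting it by ZBASE_SHIFT
+	// (16 bytes per double-complex value) turns it into the byte stride that
+	// the SAVE macros use when walking C.
+	// PRE is the byte offset passed to dcbt when prefetching the A panel in
+	// the unrolled loops; o8..o48 are the fixed displacements used by the
+	// indexed vector loads/stores (lxvd2x/lxvdsx/stxvd2x).
+	// alpha_r/alpha_i (vs30/vs31) are reloaded here from the stack slots
+	// filled in the prologue, before entering ztrmm_logic_8x2_power8.S.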
slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o8 , 8 + li o16 , 16 + li o24 , 24 + li o32 , 32 + li o48 , 48 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + lxsdx alpha_r, 0, ALPHA + lxsdx alpha_i, o8, ALPHA + + .align 4 + +#include "ztrmm_logic_8x2_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrmm_logic_8x2_power8.S b/kernel/power/ztrmm_logic_8x2_power8.S new file mode 100644 index 0000000..e250dfa --- /dev/null +++ b/kernel/power/ztrmm_logic_8x2_power8.S @@ -0,0 +1,1201 @@ + srawi. J, N, 1 + ble ZTRMM_L2_END + +ZTRMM_L2_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble ZTRMM_L2x8_END + +ZTRMM_L2x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x8_SUB4 + +ZTRMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + dcbt AO, PRE + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble ZTRMM_L2x8_LOOP_END + + .align 5 + +ZTRMM_L2x8_LOOP: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. 
L, L, -1 + bgt ZTRMM_L2x8_LOOP + +ZTRMM_L2x8_LOOP_END: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + KERNEL2x8_E2 + + b ZTRMM_L2x8_SUB1 + +ZTRMM_L2x8_SUB4: + + dcbt AO, PRE + KERNEL2x8_SUBI1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b ZTRMM_L2x8_SUB1 + +ZTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x8_SAVE + b ZTRMM_L2x8_SUB2 + +ZTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x8_SAVE + +ZTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L2x8_SUB2 + +ZTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt ZTRMM_L2x8_BEGIN + +ZTRMM_L2x8_END: + +ZTRMM_L2x4_BEGIN: + andi. T2, M, 7 + ble ZTRMM_L2x1_END + + andi. T1, M, 4 + ble ZTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x4_SUB4 + +ZTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble ZTRMM_L2x4_LOOP_END + + .align 5 + +ZTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt ZTRMM_L2x4_LOOP + +ZTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b ZTRMM_L2x4_SUB1 + +ZTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b ZTRMM_L2x4_SUB1 + +ZTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x4_SAVE + b ZTRMM_L2x4_SUB2 + +ZTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x4_SAVE + +ZTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. 
L, L, -1 + bgt ZTRMM_L2x4_SUB2 + +ZTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +ZTRMM_L2x4_END: + +ZTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble ZTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x2_SUB4 + +ZTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble ZTRMM_L2x2_LOOP_END + + .align 5 + +ZTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt ZTRMM_L2x2_LOOP + +ZTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b ZTRMM_L2x2_SUB1 + +ZTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b ZTRMM_L2x2_SUB1 + +ZTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x2_SAVE + b ZTRMM_L2x2_SUB2 + +ZTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x2_SAVE + +ZTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L2x2_SUB2 + +ZTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +ZTRMM_L2x2_END: + +ZTRMM_L2x1_BEGIN: + + andi. 
T1, M, 1 + ble ZTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L2x1_SUB4 + +ZTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble ZTRMM_L2x1_LOOP_END + + .align 5 + +ZTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt ZTRMM_L2x1_LOOP + +ZTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b ZTRMM_L2x1_SUB1 + +ZTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b ZTRMM_L2x1_SUB1 + +ZTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L2x1_SAVE + b ZTRMM_L2x1_SUB2 + +ZTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L2x1_SAVE + +ZTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L2x1_SUB2 + +ZTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +ZTRMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt ZTRMM_L2_BEGIN + + andi. T2, N, 1 + ble L999 + +ZTRMM_L2_END: + + b ZTRMM_L1_BEGIN + +L999_H1: + + b L999 + +ZTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble ZTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble ZTRMM_L1x8_END + +ZTRMM_L1x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x8_SUB4 + +ZTRMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + dcbt AO, PRE + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble ZTRMM_L1x8_LOOP_END + + .align 5 + +ZTRMM_L1x8_LOOP: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt ZTRMM_L1x8_LOOP + +ZTRMM_L1x8_LOOP_END: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + KERNEL1x8_E2 + + b ZTRMM_L1x8_SUB1 + +ZTRMM_L1x8_SUB4: + + dcbt AO, PRE + KERNEL1x8_SUBI1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b ZTRMM_L1x8_SUB1 + +ZTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x8_SAVE + b ZTRMM_L1x8_SUB2 + +ZTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x8_SAVE + +ZTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L1x8_SUB2 + +ZTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt ZTRMM_L1x8_BEGIN + +ZTRMM_L1x8_END: + +ZTRMM_L1x4_BEGIN: + andi. T2, M, 7 + ble ZTRMM_L1x1_END + + andi. T1, M, 4 + ble ZTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x4_SUB4 + +ZTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble ZTRMM_L1x4_LOOP_END + + .align 5 + +ZTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt ZTRMM_L1x4_LOOP + +ZTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b ZTRMM_L1x4_SUB1 + +ZTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b ZTRMM_L1x4_SUB1 + +ZTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x4_SAVE + b ZTRMM_L1x4_SUB2 + +ZTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x4_SAVE + +ZTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L1x4_SUB2 + +ZTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +ZTRMM_L1x4_END: + +ZTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble ZTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x2_SUB4 + +ZTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble ZTRMM_L1x2_LOOP_END + + .align 5 + +ZTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt ZTRMM_L1x2_LOOP + +ZTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b ZTRMM_L1x2_SUB1 + +ZTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b ZTRMM_L1x2_SUB1 + +ZTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x2_SAVE + b ZTRMM_L1x2_SUB2 + +ZTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x2_SAVE + +ZTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. 
L, L, -1 + bgt ZTRMM_L1x2_SUB2 + +ZTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +ZTRMM_L1x2_END: + +ZTRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble ZTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble ZTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble ZTRMM_L1x1_SUB4 + +ZTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble ZTRMM_L1x1_LOOP_END + + .align 5 + +ZTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt ZTRMM_L1x1_LOOP + +ZTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b ZTRMM_L1x1_SUB1 + +ZTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b ZTRMM_L1x1_SUB1 + +ZTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble ZTRMM_L1x1_SAVE + b ZTRMM_L1x1_SUB2 + +ZTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble ZTRMM_L1x1_SAVE + +ZTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt ZTRMM_L1x1_SUB2 + +ZTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +ZTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +ZTRMM_L1_END: diff --git a/param.h b/param.h index c46a1e9..e7dca2c 100644 --- a/param.h +++ b/param.h @@ -1962,35 +1962,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(POWER8) #define SNUMOPT 4 -#define DNUMOPT 4 +#define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 992 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 488 -#define ZGEMM_DEFAULT_P 248 +#define ZGEMM_DEFAULT_P 240 #define SGEMM_DEFAULT_Q 504 -#define DGEMM_DEFAULT_Q 504 +#define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 400 -#define ZGEMM_DEFAULT_Q 400 +#define ZGEMM_DEFAULT_Q 360 + +#define DGEMM_DEFAULT_R 14400 +#define ZGEMM_DEFAULT_R 7200 #define SYMV_P 8 #endif + #if defined(SPARC) && defined(V7) #define SNUMOPT 4
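As a reading aid for the SAVE macros added above (shared by the POWER8 zgemm and ztrmm kernels), the per-element store they perform reduces to an ordinary complex multiply of the accumulated dot product by alpha, followed, outside of TRMMKERNEL builds, by an add of the value already in C. The sketch below is illustrative only, assuming that reading of the macro comments; the function and variable names are hypothetical and do not appear in the patch.

/* Illustrative sketch only: the scalar equivalent of one element handled
 * by the SAVE* macros. acc_r/acc_i stand for the reduced real/imaginary
 * accumulators (vs0/vs1 after the XSFADD_R*/XSFADD_I* steps), alpha_r and
 * alpha_i for vs30/vs31, and trmm for the TRMMKERNEL conditional. */
static void save_element(double *c_re, double *c_im,
                         double acc_r, double acc_i,
                         double alpha_r, double alpha_i,
                         int trmm)
{
    double re = acc_r * alpha_r - acc_i * alpha_i;  /* xssubdp vs2 */
    double im = acc_r * alpha_i + acc_i * alpha_r;  /* xsadddp vs3 */

    if (!trmm) {        /* #ifndef TRMMKERNEL: add the preloaded C value */
        re += *c_re;    /* xvadddp of vs8..vs15 with vs16..vs23          */
        im += *c_im;
    }
    *c_re = re;         /* xxpermdi merge, then stxvd2x store            */
    *c_im = im;
}

In the assembly the same arithmetic is carried out per element with scalar VSX operations (xsmuldp/xssubdp/xsadddp), and the real and imaginary halves are merged with xxpermdi only immediately before the stxvd2x store.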